fix: use unique names for test training jobs (#765)

chuyang-deng · web-flow · commit 46f834e483d8 · 2019-04-25T10:06:37.000-07:00
diff --git a/tests/integ/test_rl.py b/tests/integ/test_rl.py
@@ -18,7 +18,7 @@
 import pytest
 
 from sagemaker.rl import RLEstimator, RLFramework, RLToolkit
-from sagemaker.utils import sagemaker_timestamp
+from sagemaker.utils import sagemaker_timestamp, unique_name_from_base
 from tests.integ import DATA_DIR, PYTHON_VERSION
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
@@ -29,9 +29,10 @@
 @pytest.mark.skipif(PYTHON_VERSION != 'py3', reason="RL images supports only Python 3.")
 def test_coach_mxnet(sagemaker_session, rl_coach_mxnet_full_version):
     estimator = _test_coach(sagemaker_session, RLFramework.MXNET, rl_coach_mxnet_full_version)
+    job_name = unique_name_from_base('test-coach-mxnet')
 
     with timeout(minutes=15):
-        estimator.fit(wait='False')
+        estimator.fit(wait=False, job_name=job_name)
 
         estimator = RLEstimator.attach(estimator.latest_training_job.name,
                                        sagemaker_session=sagemaker_session)
@@ -52,9 +53,10 @@ def test_coach_mxnet(sagemaker_session, rl_coach_mxnet_full_version):
 @pytest.mark.skipif(PYTHON_VERSION != 'py3', reason="RL images supports only Python 3.")
 def test_coach_tf(sagemaker_session, rl_coach_tf_full_version):
     estimator = _test_coach(sagemaker_session, RLFramework.TENSORFLOW, rl_coach_tf_full_version)
+    job_name = unique_name_from_base('test-coach-tf')
 
     with timeout(minutes=15):
-        estimator.fit()
+        estimator.fit(job_name=job_name)
 
     endpoint_name = 'test-tf-coach-deploy-{}'.format(sagemaker_timestamp())
 
@@ -104,9 +106,10 @@ def test_ray_tf(sagemaker_session, rl_ray_full_version):
                             role='SageMakerRole',
                             train_instance_type=CPU_INSTANCE,
                             train_instance_count=1)
+    job_name = unique_name_from_base('test-ray-tf')
 
     with timeout(minutes=15):
-        estimator.fit()
+        estimator.fit(job_name=job_name)
 
     with pytest.raises(NotImplementedError) as e:
         estimator.deploy(1, CPU_INSTANCE)
diff --git a/tests/integ/test_sklearn_train.py b/tests/integ/test_sklearn_train.py
@@ -21,7 +21,7 @@
 from sagemaker.sklearn.defaults import SKLEARN_VERSION
 from sagemaker.sklearn import SKLearn
 from sagemaker.sklearn import SKLearnModel
-from sagemaker.utils import sagemaker_timestamp
+from sagemaker.utils import sagemaker_timestamp, unique_name_from_base
 from tests.integ import DATA_DIR, PYTHON_VERSION, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
@@ -49,8 +49,9 @@ def test_training_with_additional_hyperparameters(sagemaker_session, sklearn_ful
                                                             key_prefix='integ-test-data/sklearn_mnist/train')
         test_input = sklearn.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                            key_prefix='integ-test-data/sklearn_mnist/test')
+        job_name = unique_name_from_base('test-sklearn-hp')
 
-        sklearn.fit({'train': train_input, 'test': test_input})
+        sklearn.fit({'train': train_input, 'test': test_input}, job_name=job_name)
         return sklearn.latest_training_job.name
 
 
@@ -109,9 +110,10 @@ def test_failed_training_job(sagemaker_session, sklearn_full_version):
 
         train_input = sklearn.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                             key_prefix='integ-test-data/sklearn_mnist/train')
+        job_name = unique_name_from_base('test-sklearn-failed')
 
         with pytest.raises(ValueError):
-            sklearn.fit(train_input)
+            sklearn.fit(train_input, job_name=job_name)
 
 
 def _run_mnist_training_job(sagemaker_session, instance_type, sklearn_full_version, wait=True):
@@ -130,8 +132,9 @@ def _run_mnist_training_job(sagemaker_session, instance_type, sklearn_full_versi
                                                             key_prefix='integ-test-data/sklearn_mnist/train')
         test_input = sklearn.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                            key_prefix='integ-test-data/sklearn_mnist/test')
+        job_name = unique_name_from_base('test-sklearn-mnist')
 
-        sklearn.fit({'train': train_input, 'test': test_input}, wait=wait)
+        sklearn.fit({'train': train_input, 'test': test_input}, wait=wait, job_name=job_name)
         return sklearn.latest_training_job.name
 
 
diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py
@@ -19,7 +19,7 @@
 
 import tests.integ
 from sagemaker.tensorflow import TensorFlow, TensorFlowModel
-from sagemaker.utils import sagemaker_timestamp
+from sagemaker.utils import sagemaker_timestamp, unique_name_from_base
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, PYTHON_VERSION
 from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
 from tests.integ.vpc_test_utils import get_or_create_vpc_resources, setup_security_group_for_encryption
@@ -46,7 +46,8 @@ def tf_training_job(sagemaker_session, tf_full_version):
                                base_job_name='test-tf')
 
         inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
-        estimator.fit(inputs)
+        job_name = unique_name_from_base('test-tf-train')
+        estimator.fit(inputs, job_name=job_name)
         print('job succeeded: {}'.format(estimator.latest_training_job.name))
 
         return estimator.latest_training_job.name
@@ -123,7 +124,8 @@ def test_tf_async(sagemaker_session):
                                base_job_name='test-tf')
 
         inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
-        estimator.fit(inputs, wait=False)
+        job_name = unique_name_from_base('test-tf-async')
+        estimator.fit(inputs, wait=False, job_name=job_name)
         training_job_name = estimator.latest_training_job.name
         time.sleep(20)
 
@@ -166,9 +168,10 @@ def test_tf_vpc_multi(sagemaker_session, tf_full_version):
                            subnets=subnet_ids,
                            security_group_ids=[security_group_id],
                            encrypt_inter_container_traffic=True)
+    job_name = unique_name_from_base('test-tf-vpc-multi')
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        estimator.fit(train_input)
+        estimator.fit(train_input, job_name=job_name)
         print('training job succeeded: {}'.format(estimator.latest_training_job.name))
 
     job_desc = sagemaker_session.sagemaker_client.describe_training_job(
@@ -209,7 +212,8 @@ def test_failed_tf_training(sagemaker_session, tf_full_version):
                                train_instance_count=1,
                                train_instance_type='ml.c4.xlarge',
                                sagemaker_session=sagemaker_session)
+        job_name = unique_name_from_base('test-tf-fail')
 
         with pytest.raises(ValueError) as e:
-            estimator.fit()
+            estimator.fit(job_name=job_name)
         assert 'This failure is expected' in str(e.value)
diff --git a/tests/integ/test_tf_cifar.py b/tests/integ/test_tf_cifar.py
@@ -22,6 +22,7 @@
 from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
 
 from sagemaker.tensorflow import TensorFlow
+from sagemaker.utils import unique_name_from_base
 
 PICKLE_CONTENT_TYPE = 'application/python-pickle'
 
@@ -55,7 +56,9 @@ def test_cifar(sagemaker_session, tf_full_version):
 
         inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                          key_prefix='data/cifar10')
-        estimator.fit(inputs, logs=False)
+        job_name = unique_name_from_base('test-tf-cifar')
+
+        estimator.fit(inputs, logs=False, job_name=job_name)
         print('job succeeded: {}'.format(estimator.latest_training_job.name))
 
     endpoint_name = estimator.latest_training_job.name
diff --git a/tests/integ/test_tf_keras.py b/tests/integ/test_tf_keras.py
@@ -21,6 +21,7 @@
 from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
 
 from sagemaker.tensorflow import TensorFlow
+from sagemaker.utils import unique_name_from_base
 
 
 @pytest.mark.canary_quick
@@ -43,8 +44,9 @@ def test_keras(sagemaker_session, tf_full_version):
 
         inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                          key_prefix='data/cifar10')
+        job_name = unique_name_from_base('test-tf-keras')
 
-        estimator.fit(inputs)
+        estimator.fit(inputs, job_name=job_name)
 
     endpoint_name = estimator.latest_training_job.name
     with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py
@@ -22,6 +22,7 @@
 from sagemaker import KMeans
 from sagemaker.mxnet import MXNet
 from sagemaker.transformer import Transformer
+from sagemaker.utils import unique_name_from_base
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, TRANSFORM_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.kms_utils import get_or_create_kms_key
 from tests.integ.timeout import timeout, timeout_and_delete_model_with_transformer
@@ -41,9 +42,10 @@ def test_transform_mxnet(sagemaker_session, mxnet_full_version):
                                                    key_prefix='integ-test-data/mxnet_mnist/train')
     test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                   key_prefix='integ-test-data/mxnet_mnist/test')
+    job_name = unique_name_from_base('test-mxnet-transform')
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        mx.fit({'train': train_input, 'test': test_input})
+        mx.fit({'train': train_input, 'test': test_input}, job_name=job_name)
 
     transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
     transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
@@ -86,8 +88,11 @@ def test_attach_transform_kmeans(sagemaker_session):
     kmeans.epochs = 1
 
     records = kmeans.record_set(train_set[0][:100])
+
+    job_name = unique_name_from_base('test-kmeans-attach')
+
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        kmeans.fit(records)
+        kmeans.fit(records, job_name=job_name)
 
     transform_input_path = os.path.join(data_path, 'transform_input.csv')
     transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
@@ -120,9 +125,10 @@ def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version):
                                                    key_prefix='integ-test-data/mxnet_mnist/train')
     test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                   key_prefix='integ-test-data/mxnet_mnist/test')
+    job_name = unique_name_from_base('test-mxnet-vpc')
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        mx.fit({'train': train_input, 'test': test_input})
+        mx.fit({'train': train_input, 'test': test_input}, job_name=job_name)
 
     job_desc = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=mx.latest_training_job.name)
     assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])