Skip to content

Commit 46f834e

Browse files
authored
fix: use unique names for test training jobs (#765)
1 parent 16b81d7 commit 46f834e

File tree

6 files changed

+39
-18
lines changed

6 files changed

+39
-18
lines changed

tests/integ/test_rl.py

+7 -4
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
import pytest
1919

2020
from sagemaker.rl import RLEstimator, RLFramework, RLToolkit
21-
from sagemaker.utils import sagemaker_timestamp
21+
from sagemaker.utils import sagemaker_timestamp, unique_name_from_base
2222
from tests.integ import DATA_DIR, PYTHON_VERSION
2323
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2424

@@ -29,9 +29,10 @@
2929
@pytest.mark.skipif(PYTHON_VERSION != 'py3', reason="RL images supports only Python 3.")
3030
def test_coach_mxnet(sagemaker_session, rl_coach_mxnet_full_version):
3131
estimator = _test_coach(sagemaker_session, RLFramework.MXNET, rl_coach_mxnet_full_version)
32+
job_name = unique_name_from_base('test-coach-mxnet')
3233

3334
with timeout(minutes=15):
34-
estimator.fit(wait='False')
35+
estimator.fit(wait='False', job_name=job_name)
3536

3637
estimator = RLEstimator.attach(estimator.latest_training_job.name,
3738
sagemaker_session=sagemaker_session)
@@ -52,9 +53,10 @@ def test_coach_mxnet(sagemaker_session, rl_coach_mxnet_full_version):
5253
@pytest.mark.skipif(PYTHON_VERSION != 'py3', reason="RL images supports only Python 3.")
5354
def test_coach_tf(sagemaker_session, rl_coach_tf_full_version):
5455
estimator = _test_coach(sagemaker_session, RLFramework.TENSORFLOW, rl_coach_tf_full_version)
56+
job_name = unique_name_from_base('test-coach-tf')
5557

5658
with timeout(minutes=15):
57-
estimator.fit()
59+
estimator.fit(job_name=job_name)
5860

5961
endpoint_name = 'test-tf-coach-deploy-{}'.format(sagemaker_timestamp())
6062

@@ -104,9 +106,10 @@ def test_ray_tf(sagemaker_session, rl_ray_full_version):
104106
role='SageMakerRole',
105107
train_instance_type=CPU_INSTANCE,
106108
train_instance_count=1)
109+
job_name = unique_name_from_base('test-ray-tf')
107110

108111
with timeout(minutes=15):
109-
estimator.fit()
112+
estimator.fit(job_name=job_name)
110113

111114
with pytest.raises(NotImplementedError) as e:
112115
estimator.deploy(1, CPU_INSTANCE)

tests/integ/test_sklearn_train.py

+7 -4
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@
2121
from sagemaker.sklearn.defaults import SKLEARN_VERSION
2222
from sagemaker.sklearn import SKLearn
2323
from sagemaker.sklearn import SKLearnModel
24-
from sagemaker.utils import sagemaker_timestamp
24+
from sagemaker.utils import sagemaker_timestamp, unique_name_from_base
2525
from tests.integ import DATA_DIR, PYTHON_VERSION, TRAINING_DEFAULT_TIMEOUT_MINUTES
2626
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2727

@@ -49,8 +49,9 @@ def test_training_with_additional_hyperparameters(sagemaker_session, sklearn_ful
4949
key_prefix='integ-test-data/sklearn_mnist/train')
5050
test_input = sklearn.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
5151
key_prefix='integ-test-data/sklearn_mnist/test')
52+
job_name = unique_name_from_base('test-sklearn-hp')
5253

53-
sklearn.fit({'train': train_input, 'test': test_input})
54+
sklearn.fit({'train': train_input, 'test': test_input}, job_name=job_name)
5455
return sklearn.latest_training_job.name
5556

5657

@@ -109,9 +110,10 @@ def test_failed_training_job(sagemaker_session, sklearn_full_version):
109110

110111
train_input = sklearn.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
111112
key_prefix='integ-test-data/sklearn_mnist/train')
113+
job_name = unique_name_from_base('test-sklearn-failed')
112114

113115
with pytest.raises(ValueError):
114-
sklearn.fit(train_input)
116+
sklearn.fit(train_input, job_name=job_name)
115117

116118

117119
def _run_mnist_training_job(sagemaker_session, instance_type, sklearn_full_version, wait=True):
@@ -130,8 +132,9 @@ def _run_mnist_training_job(sagemaker_session, instance_type, sklearn_full_versi
130132
key_prefix='integ-test-data/sklearn_mnist/train')
131133
test_input = sklearn.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
132134
key_prefix='integ-test-data/sklearn_mnist/test')
135+
job_name = unique_name_from_base('test-sklearn-mnist')
133136

134-
sklearn.fit({'train': train_input, 'test': test_input}, wait=wait)
137+
sklearn.fit({'train': train_input, 'test': test_input}, wait=wait, job_name=job_name)
135138
return sklearn.latest_training_job.name
136139

137140

tests/integ/test_tf.py

+9 -5
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
import tests.integ
2121
from sagemaker.tensorflow import TensorFlow, TensorFlowModel
22-
from sagemaker.utils import sagemaker_timestamp
22+
from sagemaker.utils import sagemaker_timestamp, unique_name_from_base
2323
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, PYTHON_VERSION
2424
from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
2525
from tests.integ.vpc_test_utils import get_or_create_vpc_resources, setup_security_group_for_encryption
@@ -46,7 +46,8 @@ def tf_training_job(sagemaker_session, tf_full_version):
4646
base_job_name='test-tf')
4747

4848
inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
49-
estimator.fit(inputs)
49+
job_name = unique_name_from_base('test-tf-train')
50+
estimator.fit(inputs, job_name=job_name)
5051
print('job succeeded: {}'.format(estimator.latest_training_job.name))
5152

5253
return estimator.latest_training_job.name
@@ -123,7 +124,8 @@ def test_tf_async(sagemaker_session):
123124
base_job_name='test-tf')
124125

125126
inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
126-
estimator.fit(inputs, wait=False)
127+
job_name = unique_name_from_base('test-tf-async')
128+
estimator.fit(inputs, wait=False, job_name=job_name)
127129
training_job_name = estimator.latest_training_job.name
128130
time.sleep(20)
129131

@@ -166,9 +168,10 @@ def test_tf_vpc_multi(sagemaker_session, tf_full_version):
166168
subnets=subnet_ids,
167169
security_group_ids=[security_group_id],
168170
encrypt_inter_container_traffic=True)
171+
job_name = unique_name_from_base('test-tf-vpc-multi')
169172

170173
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
171-
estimator.fit(train_input)
174+
estimator.fit(train_input, job_name=job_name)
172175
print('training job succeeded: {}'.format(estimator.latest_training_job.name))
173176

174177
job_desc = sagemaker_session.sagemaker_client.describe_training_job(
@@ -209,7 +212,8 @@ def test_failed_tf_training(sagemaker_session, tf_full_version):
209212
train_instance_count=1,
210213
train_instance_type='ml.c4.xlarge',
211214
sagemaker_session=sagemaker_session)
215+
job_name = unique_name_from_base('test-tf-fail')
212216

213217
with pytest.raises(ValueError) as e:
214-
estimator.fit()
218+
estimator.fit(job_name=job_name)
215219
assert 'This failure is expected' in str(e.value)

tests/integ/test_tf_cifar.py

+4 -1
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
2323

2424
from sagemaker.tensorflow import TensorFlow
25+
from sagemaker.utils import unique_name_from_base
2526

2627
PICKLE_CONTENT_TYPE = 'application/python-pickle'
2728

@@ -55,7 +56,9 @@ def test_cifar(sagemaker_session, tf_full_version):
5556

5657
inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
5758
key_prefix='data/cifar10')
58-
estimator.fit(inputs, logs=False)
59+
job_name = unique_name_from_base('test-tf-cifar')
60+
61+
estimator.fit(inputs, logs=False, job_name=job_name)
5962
print('job succeeded: {}'.format(estimator.latest_training_job.name))
6063

6164
endpoint_name = estimator.latest_training_job.name

tests/integ/test_tf_keras.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
2222

2323
from sagemaker.tensorflow import TensorFlow
24+
from sagemaker.utils import unique_name_from_base
2425

2526

2627
@pytest.mark.canary_quick
@@ -43,8 +44,9 @@ def test_keras(sagemaker_session, tf_full_version):
4344

4445
inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
4546
key_prefix='data/cifar10')
47+
job_name = unique_name_from_base('test-tf-keras')
4648

47-
estimator.fit(inputs)
49+
estimator.fit(inputs, job_name=job_name)
4850

4951
endpoint_name = estimator.latest_training_job.name
5052
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):

tests/integ/test_transformer.py

+9 -3
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
from sagemaker import KMeans
2323
from sagemaker.mxnet import MXNet
2424
from sagemaker.transformer import Transformer
25+
from sagemaker.utils import unique_name_from_base
2526
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, TRANSFORM_DEFAULT_TIMEOUT_MINUTES
2627
from tests.integ.kms_utils import get_or_create_kms_key
2728
from tests.integ.timeout import timeout, timeout_and_delete_model_with_transformer
@@ -41,9 +42,10 @@ def test_transform_mxnet(sagemaker_session, mxnet_full_version):
4142
key_prefix='integ-test-data/mxnet_mnist/train')
4243
test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
4344
key_prefix='integ-test-data/mxnet_mnist/test')
45+
job_name = unique_name_from_base('test-mxnet-transform')
4446

4547
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
46-
mx.fit({'train': train_input, 'test': test_input})
48+
mx.fit({'train': train_input, 'test': test_input}, job_name=job_name)
4749

4850
transform_input_path = os.path.join(data_path, 'transform', 'data.csv')
4951
transform_input_key_prefix = 'integ-test-data/mxnet_mnist/transform'
@@ -86,8 +88,11 @@ def test_attach_transform_kmeans(sagemaker_session):
8688
kmeans.epochs = 1
8789

8890
records = kmeans.record_set(train_set[0][:100])
91+
92+
job_name = unique_name_from_base('test-kmeans-attach')
93+
8994
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
90-
kmeans.fit(records)
95+
kmeans.fit(records, job_name=job_name)
9196

9297
transform_input_path = os.path.join(data_path, 'transform_input.csv')
9398
transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform'
@@ -120,9 +125,10 @@ def test_transform_mxnet_vpc(sagemaker_session, mxnet_full_version):
120125
key_prefix='integ-test-data/mxnet_mnist/train')
121126
test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
122127
key_prefix='integ-test-data/mxnet_mnist/test')
128+
job_name = unique_name_from_base('test-mxnet-vpc')
123129

124130
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
125-
mx.fit({'train': train_input, 'test': test_input})
131+
mx.fit({'train': train_input, 'test': test_input}, job_name=job_name)
126132

127133
job_desc = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=mx.latest_training_job.name)
128134
assert set(subnet_ids) == set(job_desc['VpcConfig']['Subnets'])

0 commit comments

Comments
 (0)