Skip to content

Commit e3dc3b5

Browse files
authored
Wrap training timeout for integ tests (#339)
1 parent fa88325 commit e3dc3b5

15 files changed

+44
-41
lines changed

tests/integ/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
import os
1717

1818
DATA_DIR = os.path.join(os.path.dirname(__file__), '..', 'data')
19+
TRAINING_DEFAULT_TIMEOUT_MINUTES = 20
20+
TUNING_DEFAULT_TIMEOUT_MINUTES = 20
1921

2022
logging.getLogger('boto3').setLevel(logging.INFO)
2123
logging.getLogger('botocore').setLevel(logging.INFO)

tests/integ/test_byo_estimator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from sagemaker.amazon.amazon_estimator import registry
2525
from sagemaker.estimator import Estimator
2626
from sagemaker.utils import name_from_base
27-
from tests.integ import DATA_DIR
27+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2828
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2929

3030

@@ -55,7 +55,7 @@ def test_byo_estimator(sagemaker_session, region):
5555
image_name = registry(region) + "/factorization-machines:1"
5656
training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
5757

58-
with timeout(minutes=15):
58+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
5959
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
6060
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
6161

tests/integ/test_chainer_train.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from sagemaker.chainer.estimator import Chainer
2323
from sagemaker.chainer.model import ChainerModel
2424
from sagemaker.utils import sagemaker_timestamp
25-
from tests.integ import DATA_DIR
25+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2626
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2727

2828

@@ -40,7 +40,7 @@ def test_distributed_gpu_training(sagemaker_session, chainer_full_version):
4040

4141

4242
def test_training_with_additional_hyperparameters(sagemaker_session, chainer_full_version):
43-
with timeout(minutes=15):
43+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
4444
script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
4545
data_path = os.path.join(DATA_DIR, 'chainer_mnist')
4646

@@ -101,7 +101,7 @@ def test_async_fit(sagemaker_session):
101101

102102

103103
def test_failed_training_job(sagemaker_session, chainer_full_version):
104-
with timeout(minutes=15):
104+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
105105
script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
106106
data_path = os.path.join(DATA_DIR, 'chainer_mnist')
107107

@@ -119,7 +119,7 @@ def test_failed_training_job(sagemaker_session, chainer_full_version):
119119

120120
def _run_mnist_training_job(sagemaker_session, instance_type, instance_count,
121121
chainer_full_version, wait=True):
122-
with timeout(minutes=15):
122+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
123123

124124
script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py') if instance_type == 1 else \
125125
os.path.join(DATA_DIR, 'chainer_mnist', 'distributed_mnist.py')

tests/integ/test_factorization_machines.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222

2323
from sagemaker import FactorizationMachines, FactorizationMachinesModel
2424
from sagemaker.utils import name_from_base
25-
from tests.integ import DATA_DIR
25+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2626
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2727

2828

2929
@pytest.mark.continuous_testing
3030
def test_factorization_machines(sagemaker_session):
31-
with timeout(minutes=15):
31+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3232
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
3333
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
3434

tests/integ/test_kmeans.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222

2323
from sagemaker import KMeans, KMeansModel
2424
from sagemaker.utils import name_from_base
25-
from tests.integ import DATA_DIR
25+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2626
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2727

2828

2929
@pytest.mark.continuous_testing
3030
def test_kmeans(sagemaker_session):
31-
with timeout(minutes=15):
31+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3232
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
3333
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
3434

tests/integ/test_knn.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222

2323
from sagemaker import KNN, KNNModel
2424
from sagemaker.utils import name_from_base
25-
from tests.integ import DATA_DIR
25+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2626
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2727

2828

2929
@pytest.mark.continuous_testing
3030
def test_knn_regressor(sagemaker_session):
31-
with timeout(minutes=15):
31+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3232
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
3333
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
3434

tests/integ/test_lda.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@
2020
from sagemaker import LDA, LDAModel
2121
from sagemaker.amazon.common import read_records
2222
from sagemaker.utils import name_from_base
23-
from tests.integ import DATA_DIR
23+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2424
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2525
from tests.integ.record_set import prepare_record_set_from_local_files
2626

2727

2828
@pytest.mark.continuous_testing
2929
def test_lda(sagemaker_session):
30-
with timeout(minutes=15):
30+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3131
data_path = os.path.join(DATA_DIR, 'lda')
3232
data_filename = 'nips-train_1.pbr'
3333

tests/integ/test_linear_learner.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@
2323

2424
from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
2525
from sagemaker.utils import name_from_base, sagemaker_timestamp
26-
from tests.integ import DATA_DIR
26+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2727
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2828

2929

3030
@pytest.mark.continuous_testing
3131
def test_linear_learner(sagemaker_session):
32-
with timeout(minutes=15):
32+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3333
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
3434
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
3535

@@ -93,7 +93,7 @@ def test_linear_learner(sagemaker_session):
9393

9494

9595
def test_linear_learner_multiclass(sagemaker_session):
96-
with timeout(minutes=15):
96+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
9797
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
9898
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
9999

@@ -125,7 +125,7 @@ def test_async_linear_learner(sagemaker_session):
125125
training_job_name = ""
126126
endpoint_name = 'test-linear-learner-async-{}'.format(sagemaker_timestamp())
127127

128-
with timeout(minutes=5):
128+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
129129
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
130130
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
131131

tests/integ/test_mxnet_train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,13 @@
2121
from sagemaker.mxnet.estimator import MXNet
2222
from sagemaker.mxnet.model import MXNetModel
2323
from sagemaker.utils import sagemaker_timestamp
24-
from tests.integ import DATA_DIR
24+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2525
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2626

2727

2828
@pytest.fixture(scope='module')
2929
def mxnet_training_job(sagemaker_session, mxnet_full_version):
30-
with timeout(minutes=15):
30+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3131
script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
3232
data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
3333

@@ -100,7 +100,7 @@ def test_async_fit(sagemaker_session):
100100

101101

102102
def test_failed_training_job(sagemaker_session, mxnet_full_version):
103-
with timeout(minutes=15):
103+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
104104
script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')
105105
data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
106106

tests/integ/test_ntm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@
2020
from sagemaker import NTM, NTMModel
2121
from sagemaker.amazon.common import read_records
2222
from sagemaker.utils import name_from_base
23-
from tests.integ import DATA_DIR
23+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2424
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2525
from tests.integ.record_set import prepare_record_set_from_local_files
2626

2727

2828
@pytest.mark.continuous_testing
2929
def test_ntm(sagemaker_session):
30-
with timeout(minutes=15):
30+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3131
data_path = os.path.join(DATA_DIR, 'ntm')
3232
data_filename = 'nips-train_1.pbr'
3333

tests/integ/test_pca.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,13 @@
2222

2323
import sagemaker.amazon.pca
2424
from sagemaker.utils import name_from_base
25-
from tests.integ import DATA_DIR
25+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2626
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2727

2828

2929
@pytest.mark.continuous_testing
3030
def test_pca(sagemaker_session):
31-
with timeout(minutes=15):
31+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3232
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
3333
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
3434

tests/integ/test_pytorch_train.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from sagemaker.pytorch.estimator import PyTorch
2020
from sagemaker.pytorch.model import PyTorchModel
2121
from sagemaker.utils import sagemaker_timestamp
22-
from tests.integ import DATA_DIR
22+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2323
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2424

2525
MNIST_DIR = os.path.join(DATA_DIR, 'pytorch_mnist')
@@ -30,7 +30,7 @@
3030
@pytest.fixture(scope='module', name='pytorch_training_job')
3131
def fixture_training_job(sagemaker_session, pytorch_full_version):
3232
instance_type = 'ml.c4.xlarge'
33-
with timeout(minutes=15):
33+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3434
pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)
3535

3636
pytorch.fit({'training': _upload_training_data(pytorch)})
@@ -103,7 +103,7 @@ def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
103103
def test_failed_training_job(sagemaker_session, pytorch_full_version):
104104
script_path = os.path.join(MNIST_DIR, 'failure_script.py')
105105

106-
with timeout(minutes=15):
106+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
107107
pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, entry_point=script_path)
108108

109109
with pytest.raises(ValueError) as e:

tests/integ/test_randomcutforest.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,13 @@
1717

1818
from sagemaker import RandomCutForest, RandomCutForestModel
1919
from sagemaker.utils import name_from_base
20+
from tests.integ import TRAINING_DEFAULT_TIMEOUT_MINUTES
2021
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
2122

2223

2324
@pytest.mark.continuous_testing
2425
def test_randomcutforest(sagemaker_session):
25-
with timeout(minutes=15):
26+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
2627
# Generate a thousand 14-dimensional datapoints.
2728
feature_num = 14
2829
train_input = np.random.rand(1000, feature_num)

tests/integ/test_tf.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,15 @@
1818
import pytest
1919

2020
from sagemaker.tensorflow import TensorFlow
21-
from tests.integ import DATA_DIR
21+
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2222
from tests.integ.timeout import timeout_and_delete_endpoint_by_name, timeout
2323

2424
DATA_PATH = os.path.join(DATA_DIR, 'iris', 'data')
2525

2626

2727
@pytest.mark.continuous_testing
2828
def test_tf(sagemaker_session, tf_full_version):
29-
with timeout(minutes=15):
29+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
3030
script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
3131

3232
estimator = TensorFlow(entry_point=script_path,
@@ -59,7 +59,7 @@ def test_tf(sagemaker_session, tf_full_version):
5959

6060

6161
def test_tf_async(sagemaker_session):
62-
with timeout(minutes=5):
62+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
6363
script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
6464

6565
estimator = TensorFlow(entry_point=script_path,
@@ -88,7 +88,7 @@ def test_tf_async(sagemaker_session):
8888

8989

9090
def test_failed_tf_training(sagemaker_session, tf_full_version):
91-
with timeout(minutes=15):
91+
with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
9292
script_path = os.path.join(DATA_DIR, 'iris', 'failure_script.py')
9393
estimator = TensorFlow(entry_point=script_path,
9494
role='SageMakerRole',

tests/integ/test_tuner.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from sagemaker.pytorch import PyTorch
3333
from sagemaker.tensorflow import TensorFlow
3434
from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter, HyperparameterTuner
35-
from tests.integ import DATA_DIR
35+
from tests.integ import DATA_DIR, TUNING_DEFAULT_TIMEOUT_MINUTES
3636
from tests.integ.record_set import prepare_record_set_from_local_files
3737
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
3838

@@ -41,7 +41,7 @@
4141

4242
@pytest.mark.continuous_testing
4343
def test_tuning_kmeans(sagemaker_session):
44-
with timeout(minutes=20):
44+
with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
4545
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
4646
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
4747

@@ -96,7 +96,7 @@ def test_tuning_kmeans(sagemaker_session):
9696

9797

9898
def test_tuning_lda(sagemaker_session):
99-
with timeout(minutes=20):
99+
with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
100100
data_path = os.path.join(DATA_DIR, 'lda')
101101
data_filename = 'nips-train_1.pbr'
102102

@@ -182,7 +182,7 @@ def test_stop_tuning_job(sagemaker_session):
182182

183183
@pytest.mark.continuous_testing
184184
def test_tuning_mxnet(sagemaker_session):
185-
with timeout(minutes=15):
185+
with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
186186
script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'tuning.py')
187187
data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
188188

@@ -219,7 +219,7 @@ def test_tuning_mxnet(sagemaker_session):
219219

220220
@pytest.mark.continuous_testing
221221
def test_tuning_tf(sagemaker_session):
222-
with timeout(minutes=15):
222+
with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
223223
script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
224224

225225
estimator = TensorFlow(entry_point=script_path,
@@ -263,7 +263,7 @@ def test_tuning_tf(sagemaker_session):
263263

264264
@pytest.mark.continuous_testing
265265
def test_tuning_chainer(sagemaker_session):
266-
with timeout(minutes=15):
266+
with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
267267
script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
268268
data_path = os.path.join(DATA_DIR, 'chainer_mnist')
269269

@@ -321,7 +321,7 @@ def test_attach_tuning_pytorch(sagemaker_session):
321321
estimator = PyTorch(entry_point=mnist_script, role='SageMakerRole', train_instance_count=1,
322322
train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session)
323323

324-
with timeout(minutes=15):
324+
with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
325325
objective_metric_name = 'evaluation-accuracy'
326326
metric_definitions = [{'Name': 'evaluation-accuracy', 'Regex': 'Overall test accuracy: (\d+)'}]
327327
hyperparameter_ranges = {'batch-size': IntegerParameter(50, 100)}
@@ -368,7 +368,7 @@ def test_tuning_byo_estimator(sagemaker_session):
368368
image_name = registry(sagemaker_session.boto_session.region_name) + '/factorization-machines:1'
369369
training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')
370370

371-
with timeout(minutes=15):
371+
with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
372372
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
373373
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
374374

0 commit comments

Comments
 (0)