Skip to content

Commit a082319

Browse files
author
Ignacio Quintero
committed
Add factorization machines async fit integ test
Also fixed the timeouts for all the async fit integ tests. Previously we allowed a 15 min timeout for training and a 20 min timeout for hosting. With async fit the sections are split differently, so we allow a 5 min timeout for the initial fit call and setup, and then 35 min for the attach() + hosting calls. The total runtime is the same, just split differently for async tests.
1 parent ec53250 commit a082319

6 files changed

+59
-16
lines changed

tests/integ/test_factorization_machines.py

+43
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import gzip
1414
import pickle
1515
import sys
16+
import time
1617

1718
import boto3
1819
import os
@@ -53,3 +54,45 @@ def test_factorization_machines():
5354
assert len(result) == 10
5455
for record in result:
5556
assert record.label["score"] is not None
57+
58+
59+
def test_async_factorization_machines():
    """Integ test: non-blocking Factorization Machines fit, then attach, deploy and predict.

    The test runs in two timed phases:

    1. Under a 5 minute timeout, load the MNIST fixture, configure the estimator and start
       training with ``wait=False`` so that ``fit`` returns without waiting for the job.
    2. Under a 35 minute timeout (which also deletes the endpoint by name on exit),
       re-attach to the training job by name, deploy the resulting model and check that
       a prediction on 10 records returns 10 scored records.
    """
    training_job_name = ""
    # SageMaker endpoint names may only contain alphanumerics and hyphens, so the base
    # name must not contain underscores or the deploy call is rejected.
    endpoint_name = name_from_base('factorization-machines')
    sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # The fixture was pickled under Python 2; Python 3 needs an explicit encoding.
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        # NOTE(review): pickle.load is only acceptable here because the file is a
        # repository-local test fixture, never external input.
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        fm = FactorizationMachines(role='SageMakerRole', train_instance_count=1,
                                   train_instance_type='ml.c4.xlarge',
                                   num_factors=10, predictor_type='regressor',
                                   epochs=2, clip_gradient=1e2, eps=0.001, rescale_grad=1.0 / 100,
                                   sagemaker_session=sagemaker_session, base_job_name='test-fm')

        # training labels must be 'float32'
        fm.fit(fm.record_set(train_set[0][:200], train_set[1][:200].astype('float32')), wait=False)
        # Remember the job name so the second phase can re-attach to it.
        training_job_name = fm.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
        estimator = FactorizationMachines.attach(training_job_name=training_job_name,
                                                 sagemaker_session=sagemaker_session)
        model = FactorizationMachinesModel(estimator.model_data, role='SageMakerRole',
                                           sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label["score"] is not None

tests/integ/test_kmeans.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,6 @@
2626

2727
import pytest
2828

29-
30-
@pytest.mark.skip(reason="no way of currently testing this")
3129
def test_kmeans():
3230

3331
with timeout(minutes=15):
@@ -71,7 +69,7 @@ def test_async_kmeans():
7169
training_job_name = ""
7270
endpoint_name = name_from_base('kmeans')
7371

74-
with timeout(minutes=15):
72+
with timeout(minutes=5):
7573
sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
7674
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
7775
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
@@ -100,7 +98,7 @@ def test_async_kmeans():
10098
time.sleep(20)
10199
print("attaching now...")
102100

103-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
101+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
104102
estimator = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
105103
model = KMeansModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
106104
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

tests/integ/test_linear_learner.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,10 @@
1414
import os
1515
import pickle
1616
import sys
17+
import time
1718
import pytest # noqa
1819
import boto3
1920
import numpy as np
20-
from datetime import time
2121

2222
import sagemaker
2323
from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
@@ -88,13 +88,14 @@ def test_linear_learner():
8888
assert record.label["score"] is not None
8989

9090

91-
def test_async_linear_learner(sagemaker_session):
91+
def test_async_linear_learner():
9292

9393
training_job_name = ""
9494
endpoint_name = 'test-linear-learner-async-{}'.format(int(time.time()))
95+
sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
96+
97+
with timeout(minutes=5):
9598

96-
with timeout(minutes=15):
97-
sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
9899
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
99100
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
100101

@@ -144,7 +145,7 @@ def test_async_linear_learner(sagemaker_session):
144145
print("Waiting to re-attach to the training job: %s" % training_job_name)
145146
time.sleep(20)
146147

147-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
148+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
148149
estimator = LinearLearner.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
149150
model = LinearLearnerModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
150151
predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

tests/integ/test_mxnet_train.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def test_async_fit(sagemaker_session):
6363
training_job_name = ""
6464
endpoint_name = 'test-mxnet-attach-deploy-{}'.format(int(time.time()))
6565

66-
with timeout(minutes=15):
66+
with timeout(minutes=5):
6767
script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
6868
data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
6969

@@ -82,7 +82,7 @@ def test_async_fit(sagemaker_session):
8282
print("Waiting to re-attach to the training job: %s" % training_job_name)
8383
time.sleep(20)
8484

85-
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
85+
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
8686
print("Re-attaching now to: %s" % training_job_name)
8787
estimator = MXNet.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
8888
predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)

tests/integ/test_pca.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ def test_pca():
6363
def test_async_pca():
6464

6565
training_job_name = ""
66-
endpoint_name = name_from_base('kmeans')
66+
endpoint_name = name_from_base('async_pca')
67+
sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
68+
69+
with timeout(minutes=20):
6770

68-
with timeout(minutes=15):
69-
sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
7071
data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
7172
pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
7273

tests/integ/test_tf.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def test_tf(sagemaker_session):
5757
def test_tf_async(sagemaker_session):
5858

5959
training_job_name = ""
60-
with timeout(minutes=15):
60+
with timeout(minutes=5):
6161
script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
6262

6363
estimator = TensorFlow(entry_point=script_path,
@@ -75,7 +75,7 @@ def test_tf_async(sagemaker_session):
7575
training_job_name = estimator.latest_training_job.name
7676
time.sleep(20)
7777

78-
with timeout_and_delete_endpoint(estimator=estimator, minutes=20):
78+
with timeout_and_delete_endpoint(estimator=estimator, minutes=35):
7979
estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
8080
json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
8181

0 commit comments

Comments
 (0)