Add factorization machines async fit integ test

Ignacio Quintero · Ignacio Quintero · commit a082319d0a76 · 2018-01-25T14:13:33.000-08:00
Also fixed the timeouts for all the async fit integ tests.
Previously we allowed 15 min timeout for training, and 20 min for
hosting.

With async fit the sections are split, so we allow a 5 min timeout for the
initial fit call and setup, and then 35 min for the attach() + hosting
calls. The total runtime is the same, just split differently for async
tests.
diff --git a/tests/integ/test_factorization_machines.py b/tests/integ/test_factorization_machines.py
@@ -13,6 +13,7 @@
 import gzip
 import pickle
 import sys
+import time
 
 import boto3
 import os
@@ -53,3 +54,45 @@ def test_factorization_machines():
         assert len(result) == 10
         for record in result:
             assert record.label["score"] is not None
+
+
+def test_async_factorization_machines():
+
+    training_job_name = ""
+    endpoint_name = name_from_base('factorization_machines')
+    sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
+
+    with timeout(minutes=5):
+
+        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
+        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
+
+        # Load the data into memory as numpy arrays
+        with gzip.open(data_path, 'rb') as f:
+            train_set, _, _ = pickle.load(f, **pickle_args)
+
+        fm = FactorizationMachines(role='SageMakerRole', train_instance_count=1,
+                                   train_instance_type='ml.c4.xlarge',
+                                   num_factors=10, predictor_type='regressor',
+                                   epochs=2, clip_gradient=1e2, eps=0.001, rescale_grad=1.0 / 100,
+                                   sagemaker_session=sagemaker_session, base_job_name='test-fm')
+
+        # training labels must be 'float32'
+        fm.fit(fm.record_set(train_set[0][:200], train_set[1][:200].astype('float32')), wait=False)
+        training_job_name = fm.latest_training_job.name
+
+        print("Detached from training job. Will re-attach in 20 seconds")
+        time.sleep(20)
+        print("attaching now...")
+
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
+        estimator = FactorizationMachines.attach(training_job_name=training_job_name,
+                                                 sagemaker_session=sagemaker_session)
+        model = FactorizationMachinesModel(estimator.model_data, role='SageMakerRole',
+                                           sagemaker_session=sagemaker_session)
+        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
+        result = predictor.predict(train_set[0][:10])
+
+        assert len(result) == 10
+        for record in result:
+            assert record.label["score"] is not None
diff --git a/tests/integ/test_kmeans.py b/tests/integ/test_kmeans.py
@@ -26,8 +26,6 @@
 
 import pytest
 
-
-@pytest.mark.skip(reason="no way of currently testing this")
 def test_kmeans():
 
     with timeout(minutes=15):
@@ -71,7 +69,7 @@ def test_async_kmeans():
     training_job_name = ""
     endpoint_name = name_from_base('kmeans')
 
-    with timeout(minutes=15):
+    with timeout(minutes=5):
         sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
         data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
         pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
@@ -100,7 +98,7 @@ def test_async_kmeans():
         time.sleep(20)
         print("attaching now...")
 
-    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
         estimator = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
         model = KMeansModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
         predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
diff --git a/tests/integ/test_linear_learner.py b/tests/integ/test_linear_learner.py
@@ -14,10 +14,10 @@
 import os
 import pickle
 import sys
+import time
 import pytest  # noqa
 import boto3
 import numpy as np
-from datetime import time
 
 import sagemaker
 from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
@@ -88,13 +88,14 @@ def test_linear_learner():
             assert record.label["score"] is not None
 
 
-def test_async_linear_learner(sagemaker_session):
+def test_async_linear_learner():
 
     training_job_name = ""
     endpoint_name = 'test-linear-learner-async-{}'.format(int(time.time()))
+    sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
+
+    with timeout(minutes=5):
 
-    with timeout(minutes=15):
-        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
         data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
         pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
 
@@ -144,7 +145,7 @@ def test_async_linear_learner(sagemaker_session):
         print("Waiting to re-attach to the training job: %s" % training_job_name)
         time.sleep(20)
 
-    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
         estimator = LinearLearner.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
         model = LinearLearnerModel(estimator.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
         predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
diff --git a/tests/integ/test_mxnet_train.py b/tests/integ/test_mxnet_train.py
@@ -63,7 +63,7 @@ def test_async_fit(sagemaker_session):
     training_job_name = ""
     endpoint_name = 'test-mxnet-attach-deploy-{}'.format(int(time.time()))
 
-    with timeout(minutes=15):
+    with timeout(minutes=5):
         script_path = os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py')
         data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
 
@@ -82,7 +82,7 @@ def test_async_fit(sagemaker_session):
         print("Waiting to re-attach to the training job: %s" % training_job_name)
         time.sleep(20)
 
-    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=20):
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, minutes=35):
         print("Re-attaching now to: %s" % training_job_name)
         estimator = MXNet.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
         predictor = estimator.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
diff --git a/tests/integ/test_pca.py b/tests/integ/test_pca.py
@@ -63,10 +63,11 @@ def test_pca():
 def test_async_pca():
 
     training_job_name = ""
-    endpoint_name = name_from_base('kmeans')
+    endpoint_name = name_from_base('async_pca')
+    sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
+
+    with timeout(minutes=20):
 
-    with timeout(minutes=15):
-        sagemaker_session = sagemaker.Session(boto_session=boto3.Session(region_name=REGION))
         data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
         pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}
 
diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py
@@ -57,7 +57,7 @@ def test_tf(sagemaker_session):
 def test_tf_async(sagemaker_session):
 
     training_job_name = ""
-    with timeout(minutes=15):
+    with timeout(minutes=5):
         script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
 
         estimator = TensorFlow(entry_point=script_path,
@@ -75,7 +75,7 @@ def test_tf_async(sagemaker_session):
         training_job_name = estimator.latest_training_job.name
         time.sleep(20)
 
-    with timeout_and_delete_endpoint(estimator=estimator, minutes=20):
+    with timeout_and_delete_endpoint(estimator=estimator, minutes=35):
         estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
         json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')