Merge pull request aws#4 from aws/pytorch-prediction

nadiaya · web-flow · commit e2334a98e172 · 2018-04-29T20:32:06.000-07:00
Add integ tests for pytorch prediction.
diff --git a/tests/data/pytorch_mnist/mnist.py b/tests/data/pytorch_mnist/mnist.py
@@ -163,3 +163,10 @@ def test(model, test_loader, cuda):
     logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
         test_loss, correct, len(test_loader.dataset),
         100. * correct / len(test_loader.dataset)))
+
+
+def model_fn(model_dir):
+    model = torch.nn.DataParallel(Net())
+    with open(os.path.join(model_dir, 'model'), 'rb') as f:
+        model.load_state_dict(torch.load(f))
+    return model
diff --git a/tests/integ/test_pytorch_train.py b/tests/integ/test_pytorch_train.py
@@ -10,55 +10,71 @@
 # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
 # ANY KIND, either express or implied. See the License for the specific
 # language governing permissions and limitations under the License.
+import numpy
 import os
+import sys
 import time
 import pytest
 from sagemaker.pytorch.estimator import PyTorch
+from sagemaker.pytorch.model import PyTorchModel
+from sagemaker.utils import sagemaker_timestamp
 from tests.integ import DATA_DIR
-from tests.integ.timeout import timeout
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 MNIST_DIR = os.path.join(DATA_DIR, 'pytorch_mnist')
 MNIST_SCRIPT = os.path.join(MNIST_DIR, 'mnist.py')
+PYTHON_VERSION = 'py' + str(sys.version_info.major)
 
 
 @pytest.fixture(scope='module', name='pytorch_training_job')
-def fixture_training_job(sagemaker_session, pytorch_full_version, instance_type):
+def fixture_training_job(sagemaker_session, pytorch_full_version):
+    instance_type = 'ml.c4.xlarge'
     with timeout(minutes=15):
-        pytorch = PyTorch(entry_point=MNIST_SCRIPT, role='SageMakerRole', framework_version=pytorch_full_version,
-                          train_instance_count=1, train_instance_type=instance_type,
-                          sagemaker_session=sagemaker_session)
+        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)
 
         pytorch.fit({'training': _upload_training_data(pytorch)})
         return pytorch.latest_training_job.name
 
 
-def test_sync_fit(sagemaker_session, pytorch_full_version):
-    training_job_name = ""
+def test_sync_fit_deploy(pytorch_training_job, sagemaker_session):
     # TODO: add tests against local mode when it's ready to be used
-    instance_type = 'ml.p2.xlarge'
+    endpoint_name = 'test-pytorch-sync-fit-attach-deploy{}'.format(sagemaker_timestamp())
+    with timeout(minutes=20):
+        estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session)
+        predictor = estimator.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
+        data = numpy.zeros(shape=(1, 1, 28, 28))
+        predictor.predict(data)
 
-    with timeout(minutes=15):
-        pytorch = PyTorch(entry_point=MNIST_SCRIPT, role='SageMakerRole', framework_version=pytorch_full_version,
-                          train_instance_count=1, train_instance_type=instance_type,
-                          sagemaker_session=sagemaker_session)
+        batch_size = 100
+        data = numpy.random.rand(batch_size, 1, 28, 28)
+        output = predictor.predict(data)
 
-        pytorch.fit({'training': _upload_training_data(pytorch)})
-        training_job_name = pytorch.latest_training_job.name
+        assert numpy.asarray(output).shape == (batch_size, 10)
 
-    if not _is_local_mode(instance_type):
-        with timeout(minutes=20):
-            PyTorch.attach(training_job_name, sagemaker_session=sagemaker_session)
 
+def test_deploy_model(pytorch_training_job, sagemaker_session):
+    endpoint_name = 'test-pytorch-deploy-model-{}'.format(sagemaker_timestamp())
+
+    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
+        desc = sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=pytorch_training_job)
+        model_data = desc['ModelArtifacts']['S3ModelArtifacts']
+        model = PyTorchModel(model_data, 'SageMakerRole', entry_point=MNIST_SCRIPT, sagemaker_session=sagemaker_session)
+        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
+
+        batch_size = 100
+        data = numpy.random.rand(batch_size, 1, 28, 28)
+        output = predictor.predict(data)
 
-def test_async_fit(sagemaker_session, pytorch_full_version):
+        assert numpy.asarray(output).shape == (batch_size, 10)
+
+
+def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
     training_job_name = ""
     # TODO: add tests against local mode when it's ready to be used
-    instance_type = 'ml.c4.xlarge'
+    instance_type = 'ml.p2.xlarge'
 
     with timeout(minutes=10):
-        pytorch = PyTorch(entry_point=MNIST_SCRIPT, role='SageMakerRole', framework_version=pytorch_full_version,
-                          train_instance_count=1, train_instance_type=instance_type,
-                          sagemaker_session=sagemaker_session)
+        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)
 
         pytorch.fit({'training': _upload_training_data(pytorch)}, wait=False)
         training_job_name = pytorch.latest_training_job.name
@@ -67,19 +83,26 @@ def test_async_fit(sagemaker_session, pytorch_full_version):
         time.sleep(20)
 
     if not _is_local_mode(instance_type):
-        with timeout(minutes=35):
+        endpoint_name = 'test-pytorch-async-fit-attach-deploy-{}'.format(sagemaker_timestamp())
+
+        with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
             print("Re-attaching now to: %s" % training_job_name)
-            PyTorch.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
+            estimator = PyTorch.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
+            predictor = estimator.deploy(1, instance_type, endpoint_name=endpoint_name)
+
+            batch_size = 100
+            data = numpy.random.rand(batch_size, 1, 28, 28)
+            output = predictor.predict(data)
+
+            assert numpy.asarray(output).shape == (batch_size, 10)
 
 
 # TODO(nadiaya): Run against local mode when errors will be propagated
 def test_failed_training_job(sagemaker_session, pytorch_full_version):
     script_path = os.path.join(MNIST_DIR, 'failure_script.py')
 
     with timeout(minutes=15):
-        pytorch = PyTorch(entry_point=script_path, role='SageMakerRole', framework_version=pytorch_full_version,
-                          train_instance_count=1, train_instance_type='ml.c4.xlarge',
-                          sagemaker_session=sagemaker_session)
+        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, entry_point=script_path)
 
         with pytest.raises(ValueError) as e:
             pytorch.fit(_upload_training_data(pytorch))
@@ -91,5 +114,12 @@ def _upload_training_data(pytorch):
                                                  key_prefix='integ-test-data/pytorch_mnist/training')
 
 
+def _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type='ml.c4.xlarge',
+                           entry_point=MNIST_SCRIPT):
+    return PyTorch(entry_point=entry_point, role='SageMakerRole', framework_version=pytorch_full_version,
+                   py_version=PYTHON_VERSION, train_instance_count=1, train_instance_type=instance_type,
+                   sagemaker_session=sagemaker_session)
+
+
 def _is_local_mode(instance_type):
     return instance_type == 'local'