Skip to content

feature: pytorch 1.3.1 eia support #1328

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 6, 2020
6 changes: 4 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ TensorFlow SageMaker Estimators

By using TensorFlow SageMaker Estimators, you can train and host TensorFlow models on Amazon SageMaker.

Supported versions of TensorFlow: ``1.4.1``, ``1.5.0``, ``1.6.0``, ``1.7.0``, ``1.8.0``, ``1.9.0``, ``1.10.0``, ``1.11.0``, ``1.12.0``, ``1.13.1``, ``1.14.0``, ``1.15.0``, ``2.0.0``.
Supported versions of TensorFlow: ``1.4.1``, ``1.5.0``, ``1.6.0``, ``1.7.0``, ``1.8.0``, ``1.9.0``, ``1.10.0``, ``1.11.0``, ``1.12.0``, ``1.13.1``, ``1.14.0``, ``1.15.0``, ``1.15.2``, ``2.0.0``, ``2.0.1``, ``2.1.0``.

Supported versions of TensorFlow for Elastic Inference: ``1.11.0``, ``1.12.0``, ``1.13.1``, ``1.14.0``.

Expand Down Expand Up @@ -208,7 +208,9 @@ PyTorch SageMaker Estimators

With PyTorch SageMaker Estimators, you can train and host PyTorch models on Amazon SageMaker.

Supported versions of PyTorch: ``0.4.0``, ``1.0.0``, ``1.1.0``, ``1.2.0``, ``1.3.1``.
Supported versions of PyTorch: ``0.4.0``, ``1.0.0``, ``1.1.0``, ``1.2.0``, ``1.3.1``, ``1.4.0``.

Supported versions of PyTorch for Elastic Inference: ``1.3.1``.

We recommend that you use the latest supported version, because that's where we focus most of our development efforts.

Expand Down
15 changes: 15 additions & 0 deletions doc/using_pytorch.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ With PyTorch Estimators and Models, you can train and host PyTorch models on Ama

Supported versions of PyTorch: ``0.4.0``, ``1.0.0``, ``1.1.0``, ``1.2.0``, ``1.3.1``.

Supported versions of PyTorch for Elastic Inference: ``1.3.1``.

We recommend that you use the latest supported version, because that's where we focus most of our development efforts.

You can visit the PyTorch repository at https://github.com/pytorch/pytorch.
Expand Down Expand Up @@ -250,6 +252,14 @@ You use the SageMaker PyTorch model server to host your PyTorch model when you c
Estimator. The model server runs inside a SageMaker Endpoint, which your call to ``deploy`` creates.
You can access the name of the Endpoint by the ``name`` property on the returned ``Predictor``.

PyTorch on Amazon SageMaker has support for `Elastic Inference <https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html>`_, which allows for inference acceleration to a hosted endpoint for a fraction of the cost of using a full GPU instance.
In order to attach an Elastic Inference accelerator to your endpoint provide the accelerator type to ``accelerator_type`` to your ``deploy`` call.

.. code:: python

predictor = pytorch_estimator.deploy(instance_type='ml.m4.xlarge',
initial_instance_count=1,
accelerator_type='ml.eia2.medium')

The SageMaker PyTorch Model Server
==================================
Expand Down Expand Up @@ -291,6 +301,11 @@ It loads the model parameters from a ``model.pth`` file in the SageMaker model d
model.load_state_dict(torch.load(f))
return model

However, if you are using PyTorch Elastic Inference, you do not have to provide a ``model_fn`` since the PyTorch serving
container has a default one for you. But please note that if you are utilizing the default ``model_fn``, please save
yor parameter file as ``model.pt`` instead of ``model.pth``. For more information on inference script, please refer to:
`SageMaker PyTorch Default Inference Handler <https://github.com/aws/sagemaker-pytorch-serving-container/blob/master/src/sagemaker_pytorch_serving_container/default_inference_handler.py>`_.

Serve a PyTorch Model
---------------------

Expand Down
22 changes: 19 additions & 3 deletions src/sagemaker/fw_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,14 @@
)

VALID_PY_VERSIONS = ["py2", "py3"]
VALID_EIA_FRAMEWORKS = ["tensorflow", "tensorflow-serving", "mxnet", "mxnet-serving"]
VALID_EIA_FRAMEWORKS = [
"tensorflow",
"tensorflow-serving",
"mxnet",
"mxnet-serving",
"pytorch-serving",
]
PY2_RESTRICTED_EIA_FRAMEWORKS = ["pytorch-serving"]
VALID_ACCOUNTS_BY_REGION = {"us-gov-west-1": "246785580436", "us-iso-east-1": "744548109606"}
ASIMOV_VALID_ACCOUNTS_BY_REGION = {"us-iso-east-1": "886529160074"}
OPT_IN_ACCOUNTS_BY_REGION = {"ap-east-1": "057415533634", "me-south-1": "724002660598"}
Expand All @@ -71,6 +78,7 @@
"mxnet-serving-eia": "mxnet-inference-eia",
"pytorch": "pytorch-training",
"pytorch-serving": "pytorch-inference",
"pytorch-serving-eia": "pytorch-inference-eia",
}

MERGED_FRAMEWORKS_LOWEST_VERSIONS = {
Expand All @@ -82,6 +90,7 @@
"mxnet-serving-eia": [1, 4, 1],
"pytorch": [1, 2, 0],
"pytorch-serving": [1, 2, 0],
"pytorch-serving-eia": [1, 3, 1],
}

DEBUGGER_UNSUPPORTED_REGIONS = ["us-gov-west-1", "us-iso-east-1"]
Expand Down Expand Up @@ -207,6 +216,7 @@ def create_image_uri(

if _accelerator_type_valid_for_framework(
framework=framework,
py_version=py_version,
accelerator_type=accelerator_type,
optimized_families=optimized_families,
):
Expand Down Expand Up @@ -259,21 +269,27 @@ def create_image_uri(


def _accelerator_type_valid_for_framework(
framework, accelerator_type=None, optimized_families=None
framework, py_version, accelerator_type=None, optimized_families=None
):
"""
Args:
framework:
py_version:
accelerator_type:
optimized_families:
"""
if accelerator_type is None:
return False

if py_version == "py2" and framework in PY2_RESTRICTED_EIA_FRAMEWORKS:
raise ValueError(
"{} is not supported with Amazon Elastic Inference in Python 2.".format(framework)
)

if framework not in VALID_EIA_FRAMEWORKS:
raise ValueError(
"{} is not supported with Amazon Elastic Inference. Currently only "
"Python-based TensorFlow and MXNet are supported.".format(framework)
"Python-based TensorFlow, MXNet, PyTorch are supported.".format(framework)
)

if optimized_families:
Expand Down
2 changes: 2 additions & 0 deletions src/sagemaker/pytorch/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ With PyTorch Estimators and Models, you can train and host PyTorch models on Ama

Supported versions of PyTorch: ``0.4.0``, ``1.0.0``, ``1.1.0``, ``1.2.0``, ``1.3.1``, ``1.4.0``.

Supported versions of PyTorch for Elastic Inference: ``1.3.1``.

We recommend that you use the latest supported version, because that's where we focus most of our development efforts.

You can visit the PyTorch repository at https://github.com/pytorch/pytorch.
Expand Down
4 changes: 2 additions & 2 deletions src/sagemaker/pytorch/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def prepare_container_def(self, instance_type, accelerator_type=None):
For example, 'ml.p2.xlarge'.
accelerator_type (str): The Elastic Inference accelerator type to
deploy to the instance for loading and making inferences to the
model. Currently unsupported with PyTorch.
model.

Returns:
dict[str, str]: A container definition object usable with the
Expand Down Expand Up @@ -169,7 +169,7 @@ def serving_image_uri(self, region_name, instance_type, accelerator_type=None):
(cpu/gpu/family-specific optimized).
accelerator_type (str): The Elastic Inference accelerator type to
deploy to the instance for loading and making inferences to the
model. Currently unsupported with PyTorch.
model.

Returns:
str: The appropriate image URI based on the given parameters.
Expand Down
13 changes: 13 additions & 0 deletions tests/data/pytorch_eia/empty_inference_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
# This file is intentionally left blank to invoke default model_fn and predict_fn
Binary file added tests/data/pytorch_eia/model_mnist.tar.gz
Binary file not shown.
30 changes: 30 additions & 0 deletions tests/integ/test_pytorch_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@
MNIST_DIR = os.path.join(DATA_DIR, "pytorch_mnist")
MNIST_SCRIPT = os.path.join(MNIST_DIR, "mnist.py")

EIA_DIR = os.path.join(DATA_DIR, "pytorch_eia")
EIA_MODEL = os.path.join(EIA_DIR, "model_mnist.tar.gz")
EIA_SCRIPT = os.path.join(EIA_DIR, "empty_inference_script.py")


@pytest.fixture(scope="module", name="pytorch_training_job")
def fixture_training_job(sagemaker_session, pytorch_full_version, cpu_instance_type):
Expand Down Expand Up @@ -115,6 +119,32 @@ def test_deploy_model(pytorch_training_job, sagemaker_session, cpu_instance_type
assert output.shape == (batch_size, 10)


@pytest.mark.skipif(PYTHON_VERSION == "py2", reason="PyTorch EIA does not support Python 2.")
def test_deploy_model_with_accelerator(sagemaker_session, cpu_instance_type):
endpoint_name = "test-pytorch-deploy-eia-{}".format(sagemaker_timestamp())
model_data = sagemaker_session.upload_data(path=EIA_MODEL)
pytorch = PyTorchModel(
model_data,
"SageMakerRole",
framework_version="1.3.1",
entry_point=EIA_SCRIPT,
sagemaker_session=sagemaker_session,
)
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
predictor = pytorch.deploy(
initial_instance_count=1,
instance_type=cpu_instance_type,
accelerator_type="ml.eia2.medium",
endpoint_name=endpoint_name,
)

batch_size = 100
data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
output = predictor.predict(data)

assert output.shape == (batch_size, 10)


def _upload_training_data(pytorch):
return pytorch.sagemaker_session.upload_data(
path=os.path.join(MNIST_DIR, "training"),
Expand Down
31 changes: 31 additions & 0 deletions tests/unit/test_fw_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,37 @@ def test_mxnet_eia_images():
)


def test_pytorch_eia_images():
image_uri = fw_utils.create_image_uri(
"us-east-1",
"pytorch-serving",
"ml.c4.2xlarge",
"1.3.1",
"py3",
accelerator_type="ml.eia1.large",
)
assert (
image_uri
== "{}.dkr.ecr.us-east-1.amazonaws.com/pytorch-inference-eia:1.3.1-cpu-py3".format(
fw_utils.ASIMOV_PROD_ACCOUNT
)
)


def test_pytorch_eia_py2_error():
error_message = "pytorch-serving is not supported with Amazon Elastic Inference in Python 2."
with pytest.raises(ValueError) as error:
fw_utils.create_image_uri(
"us-east-1",
"pytorch-serving",
"ml.c4.2xlarge",
"1.3.1",
"py2",
accelerator_type="ml.eia1.large",
)
assert error_message in str(error)


def test_create_image_uri_override_account():
image_uri = fw_utils.create_image_uri(
"us-west-1", MOCK_FRAMEWORK, "ml.p3.2xlarge", "1.0rc", "py3", account="fake"
Expand Down
16 changes: 12 additions & 4 deletions tests/unit/test_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,11 +345,19 @@ def test_non_mms_model(repack_model, sagemaker_session):

@patch("sagemaker.fw_utils.tar_and_upload_dir", MagicMock())
def test_model_image_accelerator(sagemaker_session):
model = PyTorchModel(
MODEL_DATA, role=ROLE, entry_point=SCRIPT_PATH, sagemaker_session=sagemaker_session
with pytest.raises(ValueError) as error:
model = PyTorchModel(
MODEL_DATA,
role=ROLE,
entry_point=SCRIPT_PATH,
sagemaker_session=sagemaker_session,
framework_version="1.3.1",
py_version="py2",
)
model.deploy(1, CPU, accelerator_type=ACCELERATOR_TYPE)
assert "pytorch-serving is not supported with Amazon Elastic Inference in Python 2." in str(
error
)
with pytest.raises(ValueError):
model.prepare_container_def(INSTANCE_TYPE, accelerator_type=ACCELERATOR_TYPE)


def test_train_image_default(sagemaker_session):
Expand Down