breaking: Change Model server to Torchserve for PyTorch Inference #79

Merged: 30 commits, Jul 31, 2020
Commits (30)
8225190
Change Model server to Torchserve for PyTorch Inference
dk19y Jul 10, 2020
900b406
PR Feedback - Replace MMS with TS
dk19y Jul 15, 2020
9bb211a
Fixed flake errors
dk19y Jul 17, 2020
1abf0e3
Fix env module usage from sagemaker-inference-toolkit
dk19y Jul 17, 2020
cf38f47
Fix unit test failures
dk19y Jul 17, 2020
8d51e40
PR Feedback
dk19y Jul 27, 2020
d16ed07
Fix flakes issues
dk19y Jul 27, 2020
a4eee3e
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 27, 2020
0ae09ff
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 27, 2020
f0629ae
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 27, 2020
270489b
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 27, 2020
299d6cb
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 27, 2020
ed641f8
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 27, 2020
292cf5a
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 27, 2020
19f0da0
PR Feedback
dk19y Jul 27, 2020
9ccc715
Update test/unit/test_model_server.py
dk19y Jul 27, 2020
b565428
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 29, 2020
9262b48
PR Feedback - Unit Test
dk19y Jul 29, 2020
2be5474
Update test_model_server.py
dk19y Jul 29, 2020
22567d0
Update test_model_server.py
dk19y Jul 29, 2020
5dada25
Update framework version to 1.6.0
dk19y Jul 29, 2020
9cc1574
Update DLC Containers
dk19y Jul 29, 2020
b0e1d5f
Update version in buildspec for Codebuild
dk19y Jul 30, 2020
07556d6
Update 1.5.0 Images for CodeBuild
dk19y Jul 30, 2020
b37fcd3
Skip EIA Test for 1.5.0
dk19y Jul 30, 2020
5d48150
Skipped EIA Test
dk19y Jul 31, 2020
811e171
Retrigger build
dk19y Jul 31, 2020
c2cc29e
Update src/sagemaker_pytorch_serving_container/torchserve.py
dk19y Jul 31, 2020
6ab7850
Update Logger
dk19y Jul 31, 2020
0dd3199
Update logger
dk19y Jul 31, 2020
File renamed without changes (evidently default_inference_handler.py → default_pytorch_inference_handler.py, matching the import change in handler_service.py below).
3 changes: 2 additions & 1 deletion setup.py
@@ -31,6 +31,7 @@ def read(fname):

packages=find_packages(where='src', exclude=('test',)),
package_dir={'': 'src'},
package_data={'': ["etc/*"]},
py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')],

long_description=read('README.rst'),
@@ -56,7 +57,7 @@ def read(fname):
'test': ['boto3==1.10.32', 'coverage==4.5.3', 'docker-compose==1.23.2', 'flake8==3.7.7', 'Flask==1.1.1',
'mock==2.0.0', 'pytest==4.4.0', 'pytest-cov==2.7.1', 'pytest-xdist==1.28.0', 'PyYAML==3.10',
'sagemaker==1.56.3', 'sagemaker-containers>=2.5.4', 'six==1.12.0', 'requests==2.20.0',
'requests_mock==1.6.0', 'torch==1.5.0', 'torchvision==0.6.0', 'tox==3.7.0']
'requests_mock==1.6.0', 'torch==1.6.0', 'torchvision==0.7.0', 'tox==3.7.0']
},

entry_points={
@@ -0,0 +1,4 @@
# Based on https://github.com/pytorch/serve/blob/master/docs/configuration.md
enable_envvars_config=true
decode_input_request=false
load_models=ALL
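
Because this file sets enable_envvars_config=true, TorchServe can also pick up configuration overrides from environment variables at startup. A minimal sketch of that mechanism, assuming TorchServe's documented TS_-prefix convention; the property and value below are illustrative, not part of this PR:

import os

# Hypothetical override: with enable_envvars_config=true, TorchServe maps
# TS_DEFAULT_WORKERS_PER_MODEL onto the default_workers_per_model property.
os.environ["TS_DEFAULT_WORKERS_PER_MODEL"] = "2"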
50 changes: 50 additions & 0 deletions src/sagemaker_pytorch_serving_container/etc/log4j.properties
@@ -0,0 +1,50 @@
log4j.rootLogger = INFO, console

log4j.appender.console = org.apache.log4j.ConsoleAppender
log4j.appender.console.Target = System.out
log4j.appender.console.layout = org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern = %d{ISO8601} [%-5p] %t %c - %m%n

log4j.appender.access_log = org.apache.log4j.RollingFileAppender
log4j.appender.access_log.File = ${LOG_LOCATION}/access_log.log
log4j.appender.access_log.MaxFileSize = 10MB
log4j.appender.access_log.MaxBackupIndex = 5
log4j.appender.access_log.layout = org.apache.log4j.PatternLayout
log4j.appender.access_log.layout.ConversionPattern = %d{ISO8601} - %m%n

log4j.appender.ts_log = org.apache.log4j.RollingFileAppender
log4j.appender.ts_log.File = ${LOG_LOCATION}/ts_log.log
log4j.appender.ts_log.MaxFileSize = 10MB
log4j.appender.ts_log.MaxBackupIndex = 5
log4j.appender.ts_log.layout = org.apache.log4j.PatternLayout
log4j.appender.ts_log.layout.ConversionPattern = %d{ISO8601} [%-5p] %t %c - %m%n

log4j.appender.ts_metrics = org.apache.log4j.RollingFileAppender
log4j.appender.ts_metrics.File = ${METRICS_LOCATION}/ts_metrics.log
log4j.appender.ts_metrics.MaxFileSize = 10MB
log4j.appender.ts_metrics.MaxBackupIndex = 5
log4j.appender.ts_metrics.layout = org.apache.log4j.PatternLayout
log4j.appender.ts_metrics.layout.ConversionPattern = %d{ISO8601} - %m%n

log4j.appender.model_log = org.apache.log4j.RollingFileAppender
log4j.appender.model_log.File = ${LOG_LOCATION}/model_log.log
log4j.appender.model_log.MaxFileSize = 10MB
log4j.appender.model_log.MaxBackupIndex = 5
log4j.appender.model_log.layout = org.apache.log4j.PatternLayout
log4j.appender.model_log.layout.ConversionPattern = %d{ISO8601} [%-5p] %c - %m%n

log4j.appender.model_metrics = org.apache.log4j.RollingFileAppender
log4j.appender.model_metrics.File = ${METRICS_LOCATION}/model_metrics.log
log4j.appender.model_metrics.MaxFileSize = 10MB
log4j.appender.model_metrics.MaxBackupIndex = 5
log4j.appender.model_metrics.layout = org.apache.log4j.PatternLayout
log4j.appender.model_metrics.layout.ConversionPattern = %d{ISO8601} - %m%n

log4j.logger.com.amazonaws.ml.ts = INFO, ts_log
log4j.logger.ACCESS_LOG = INFO, access_log
log4j.logger.TS_METRICS = INFO, ts_metrics
log4j.logger.MODEL_METRICS = INFO, model_metrics
log4j.logger.MODEL_LOG = INFO, model_log

log4j.logger.org.apache = OFF
log4j.logger.io.netty = ERROR
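
The ${LOG_LOCATION} and ${METRICS_LOCATION} placeholders are not defined in this file; TorchServe resolves them from environment variables of the same names when the server starts. A minimal sketch, with assumed paths that are not taken from this PR:

import os

# Hypothetical log destinations; TorchServe substitutes these env vars into
# the ${LOG_LOCATION}/${METRICS_LOCATION} placeholders in log4j.properties.
os.environ.setdefault("LOG_LOCATION", "/opt/ml/logs")
os.environ.setdefault("METRICS_LOCATION", "/opt/ml/metrics")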
3 changes: 1 addition & 2 deletions src/sagemaker_pytorch_serving_container/handler_service.py
@@ -14,8 +14,7 @@

from sagemaker_inference.default_handler_service import DefaultHandlerService
from sagemaker_inference.transformer import Transformer
from sagemaker_pytorch_serving_container.default_inference_handler import \
DefaultPytorchInferenceHandler
from sagemaker_pytorch_serving_container.default_pytorch_inference_handler import DefaultPytorchInferenceHandler

import os
import sys
11 changes: 5 additions & 6 deletions src/sagemaker_pytorch_serving_container/serving.py
@@ -15,11 +15,10 @@
from subprocess import CalledProcessError

from retrying import retry
from sagemaker_inference import model_server

from sagemaker_pytorch_serving_container import torchserve
from sagemaker_pytorch_serving_container import handler_service

HANDLER_SERVICE = handler_service.__name__
HANDLER_SERVICE = handler_service.__file__


def _retry_if_error(exception):
@@ -28,12 +27,12 @@ def _retry_if_error(exception):

@retry(stop_max_delay=1000 * 30,
retry_on_exception=_retry_if_error)
def _start_model_server():
def _start_torchserve():
# there's a race condition that causes the model server command to
# sometimes fail with 'bad address'. more investigation needed
# retry starting mms until it's ready
model_server.start_model_server(handler_service=HANDLER_SERVICE)
torchserve.start_torchserve(handler_service=HANDLER_SERVICE)


def main():
_start_model_server()
_start_torchserve()
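
For context, stop_max_delay is in milliseconds, so the decorator retries _start_torchserve for up to 30 seconds whenever _retry_if_error returns True for the raised exception. A minimal sketch of the same retrying pattern, with an illustrative predicate (the real _retry_if_error body is elided from this diff):

from subprocess import CalledProcessError

from retrying import retry


def _retry_if_error(exception):
    # Illustrative predicate; the actual body is not shown in this diff.
    return isinstance(exception, CalledProcessError)


@retry(stop_max_delay=1000 * 30, retry_on_exception=_retry_if_error)
def _start_server():
    # Re-invoked on qualifying exceptions until it succeeds or 30s elapse.
    ...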
213 changes: 213 additions & 0 deletions src/sagemaker_pytorch_serving_container/torchserve.py
@@ -0,0 +1,213 @@
# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""This module contains functionality to configure and start Torchserve."""
from __future__ import absolute_import

import os
import signal
import subprocess
import sys

import pkg_resources
import psutil
import logging
from retrying import retry

import sagemaker_pytorch_serving_container
from sagemaker_inference import default_handler_service, environment, utils
from sagemaker_inference.environment import code_dir

logger = logging.getLogger()

TS_CONFIG_FILE = os.path.join("/etc", "sagemaker-ts.properties")
DEFAULT_HANDLER_SERVICE = default_handler_service.__name__
DEFAULT_TS_CONFIG_FILE = pkg_resources.resource_filename(
sagemaker_pytorch_serving_container.__name__, "/etc/default-ts.properties"
)
MME_TS_CONFIG_FILE = pkg_resources.resource_filename(
sagemaker_pytorch_serving_container.__name__, "/etc/mme-ts.properties"
)
DEFAULT_TS_LOG_FILE = pkg_resources.resource_filename(
sagemaker_pytorch_serving_container.__name__, "/etc/log4j.properties"
)
DEFAULT_TS_MODEL_DIRECTORY = os.path.join(os.getcwd(), ".sagemaker", "ts", "models")
DEFAULT_TS_MODEL_NAME = "model"
DEFAULT_TS_MODEL_SERIALIZED_FILE = "model.pth"
DEFAULT_HANDLER_SERVICE = "sagemaker_pytorch_serving_container.handler_service"

ENABLE_MULTI_MODEL = os.getenv("SAGEMAKER_MULTI_MODEL", "false") == "true"
MODEL_STORE = "/" if ENABLE_MULTI_MODEL else DEFAULT_TS_MODEL_DIRECTORY

PYTHON_PATH_ENV = "PYTHONPATH"
REQUIREMENTS_PATH = os.path.join(code_dir, "requirements.txt")
TS_NAMESPACE = "org.pytorch.serve.ModelServer"


def start_torchserve(handler_service=DEFAULT_HANDLER_SERVICE):
"""Configure and start the model server.

Args:
handler_service (str): Python path pointing to a module that defines
a class with the following:

- A ``handle`` method, which is invoked for all incoming inference
requests to the model server.
- A ``initialize`` method, which is invoked at model server start up
for loading the model.

Defaults to ``sagemaker_pytorch_serving_container.default_handler_service``.

"""

if ENABLE_MULTI_MODEL:
if "SAGEMAKER_HANDLER" not in os.environ:
os.environ["SAGEMAKER_HANDLER"] = handler_service
_set_python_path()
else:
_adapt_to_ts_format(handler_service)

_create_torchserve_config_file()

if os.path.exists(REQUIREMENTS_PATH):
_install_requirements()

ts_torchserve_cmd = [
"torchserve",
"--start",
"--model-store",
MODEL_STORE,
"--ts-config",
TS_CONFIG_FILE,
"--log-config",
DEFAULT_TS_LOG_FILE,
"--models",
"model.mar"
]

print(ts_torchserve_cmd)

logger.info(ts_torchserve_cmd)
subprocess.Popen(ts_torchserve_cmd)

ts_process = _retrieve_ts_server_process()

_add_sigterm_handler(ts_process)

ts_process.wait()


def _adapt_to_ts_format(handler_service):
if not os.path.exists(DEFAULT_TS_MODEL_DIRECTORY):
os.makedirs(DEFAULT_TS_MODEL_DIRECTORY)

model_archiver_cmd = [
"torch-model-archiver",
"--model-name",
DEFAULT_TS_MODEL_NAME,
"--handler",
handler_service,
"--serialized-file",
os.path.join(environment.model_dir, DEFAULT_TS_MODEL_SERIALIZED_FILE),
"--export-path",
DEFAULT_TS_MODEL_DIRECTORY,
"--extra-files",
os.path.join(environment.model_dir, environment.Environment().module_name + ".py"),
"--version",
"1",
]

logger.info(model_archiver_cmd)
subprocess.check_call(model_archiver_cmd)

_set_python_path()


def _set_python_path():
# Torchserve handles code execution by appending the export path, provided
# to the model archiver, to the PYTHONPATH env var.
# The code_dir has to be added to the PYTHONPATH otherwise the
# user provided module can not be imported properly.
if PYTHON_PATH_ENV in os.environ:
os.environ[PYTHON_PATH_ENV] = "{}:{}".format(environment.code_dir, os.environ[PYTHON_PATH_ENV])
else:
os.environ[PYTHON_PATH_ENV] = environment.code_dir


def _create_torchserve_config_file():
configuration_properties = _generate_ts_config_properties()

utils.write_file(TS_CONFIG_FILE, configuration_properties)


def _generate_ts_config_properties():
env = environment.Environment()

user_defined_configuration = {
"default_response_timeout": env.model_server_timeout,
"default_workers_per_model": env.model_server_workers,
"inference_address": "http://0.0.0.0:{}".format(env.inference_http_port),
"management_address": "http://0.0.0.0:{}".format(env.management_http_port),
}

custom_configuration = str()

for key in user_defined_configuration:
value = user_defined_configuration.get(key)
if value:
custom_configuration += "{}={}\n".format(key, value)

if ENABLE_MULTI_MODEL:
default_configuration = utils.read_file(MME_TS_CONFIG_FILE)
else:
default_configuration = utils.read_file(DEFAULT_TS_CONFIG_FILE)

return default_configuration + custom_configuration


def _add_sigterm_handler(ts_process):
def _terminate(signo, frame): # pylint: disable=unused-argument
try:
os.kill(ts_process.pid, signal.SIGTERM)
except OSError:
pass

signal.signal(signal.SIGTERM, _terminate)


def _install_requirements():
logger.info("installing packages from requirements.txt...")
pip_install_cmd = [sys.executable, "-m", "pip", "install", "-r", REQUIREMENTS_PATH]

try:
subprocess.check_call(pip_install_cmd)
except subprocess.CalledProcessError:
logger.exception("failed to install required packages, exiting")
raise ValueError("failed to install required packages")


# retry for 10 seconds
@retry(stop_max_delay=10 * 1000)
def _retrieve_ts_server_process():
ts_server_processes = list()

for process in psutil.process_iter():
if TS_NAMESPACE in process.cmdline():
ts_server_processes.append(process)

if not ts_server_processes:
raise Exception("Torchserve model server was unsuccessfully started")

if len(ts_server_processes) > 1:
raise Exception("multiple ts model servers are not supported")

return ts_server_processes[0]
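
A hedged usage sketch of how the pieces above wire together. Note that ENABLE_MULTI_MODEL is evaluated at import time, so SAGEMAKER_MULTI_MODEL must be set before the import; the values here are illustrative:

import os

# Must be set before importing the module, since ENABLE_MULTI_MODEL is
# read at import time.
os.environ["SAGEMAKER_MULTI_MODEL"] = "false"  # single-model endpoint

from sagemaker_pytorch_serving_container import torchserve

# Archives model.pth into model.mar (single-model mode), writes
# /etc/sagemaker-ts.properties, starts TorchServe, and blocks on the
# server process.
torchserve.start_torchserve(
    handler_service="sagemaker_pytorch_serving_container.handler_service"
)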
2 changes: 1 addition & 1 deletion test/conftest.py
@@ -53,7 +53,7 @@ def pytest_addoption(parser):
parser.addoption('--accelerator-type')
parser.addoption('--docker-base-name', default='sagemaker-pytorch-inference')
parser.addoption('--region', default='us-west-2')
parser.addoption('--framework-version', default="1.5.0")
parser.addoption('--framework-version', default="1.6.0")
parser.addoption('--py-version', choices=['2', '3'], default='3')
# Processor is still "cpu" for EIA tests
parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu')
12 changes: 12 additions & 0 deletions test/container/1.5.0/Dockerfile.dlc.cpu
@@ -1,6 +1,18 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.5.0-cpu-py3

ARG TS_VERSION=0.1.1
RUN apt-get update \
&& apt-get install -y --no-install-recommends software-properties-common \
&& add-apt-repository ppa:openjdk-r/ppa \
&& apt-get update \
&& apt-get install -y --no-install-recommends openjdk-11-jdk

RUN pip install torchserve==$TS_VERSION \
&& pip install torch-model-archiver==$TS_VERSION

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
rm /sagemaker_pytorch_inference.tar.gz

CMD ["torchserve", "--start", "--ts-config", "/home/model-server/config.properties", "--model-store", "/home/model-server/"]
14 changes: 13 additions & 1 deletion test/container/1.5.0/Dockerfile.dlc.gpu
@@ -1,6 +1,18 @@
ARG region
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.5.0-gpu-py3
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.5.0-cpu-py3

ARG TS_VERSION=0.1.1
RUN apt-get update \
&& apt-get install -y --no-install-recommends software-properties-common \
&& add-apt-repository ppa:openjdk-r/ppa \
&& apt-get update \
&& apt-get install -y --no-install-recommends openjdk-11-jdk

RUN pip install torchserve==$TS_VERSION \
&& pip install torch-model-archiver==$TS_VERSION

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
rm /sagemaker_pytorch_inference.tar.gz

CMD ["torchserve", "--start", "--ts-config", "/home/model-server/config.properties", "--model-store", "/home/model-server/"]