Skip to content

infra: refactor toolkit tests. #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Mar 17, 2020
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# flake8 configuration (run via `tox -e flake8` in the buildspecs).
[flake8]
# First-party package names, used by flake8-import-order to group imports.
application_import_names = image_utils, integration, local_mode_utils, sagemaker_pytorch_container, test, test-toolkit, timeout, utils
import-order-style = google
2 changes: 1 addition & 1 deletion buildspec-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ phases:
# run unit tests
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
tox -e py27,py36 -- test/unit
tox -e py36 -- test/unit

# run local integ tests
#- $(aws ecr get-login --no-include-email --region us-west-2)
Expand Down
102 changes: 102 additions & 0 deletions buildspec-toolkit.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# CodeBuild buildspec for the toolkit test suite (test-toolkit/):
# linters, unit tests, then local and SageMaker integration tests
# across generic, DLC CPU/GPU, and DLC EIA images.
version: 0.2

env:
  variables:
    FRAMEWORK_VERSION: '1.4.0'
    EIA_FRAMEWORK_VERSION: '1.3.1'
    CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
    GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
    EIA_ACCELERATOR_TYPE: 'ml.eia2.medium'
    ECR_REPO: 'sagemaker-test'
    GITHUB_REPO: 'sagemaker-pytorch-serving-container'
    # AWS Deep Learning Containers account hosting the base DLC images.
    DLC_ACCOUNT: '763104351884'
    SETUP_FILE: 'setup_cmds.sh'
    # '\n' is literal in this single-quoted scalar; printf expands it when
    # the setup file is written in the build phase below.
    SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'

phases:
  pre_build:
    commands:
      - start-dockerd
      - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
      - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
      # Extract the PR number from the source version, e.g. "pr/123" -> "123".
      - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
      # Build IDs contain ':', which is not valid in a Docker tag.
      - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
      - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'

  build:
    commands:
      - TOX_PARALLEL_NO_SPINNER=1
      - PY_COLORS=0

      # install
      - pip3 install -U -e .[test]

      # run linters
      - tox -e flake8,twine

      # run unit tests
      # NOTE(review): buildspec-release.yml passes positional args to pytest with
      # `tox -e py36 -- test/unit`; the missing `--` separator here may still work
      # but is inconsistent — confirm against tox CLI behavior.
      - tox -e py36 test-toolkit/unit

      # define tags
      - GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
      - DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
      - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
      - DLC_EIA_TAG="$FRAMEWORK_VERSION-dlc-eia-$BUILD_ID"

      # run local CPU integration tests (build and push the image to ECR repo)
      - test_cmd="pytest test-toolkit/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
      - test_cmd="pytest test-toolkit/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"

      # launch remote GPU instance
      # Strip the 'ml.' prefix: EC2 wants e.g. 'p2.xlarge', SageMaker 'ml.p2.xlarge'.
      - prefix='ml.'
      - instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
      - create-key-pair
      - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest

      # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
      - python3 setup.py sdist
      - build_dir="test-toolkit/docker/$FRAMEWORK_VERSION"
      - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
      # push DLC GPU image to ECR
      - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
      - docker push $PREPROD_IMAGE:$DLC_GPU_TAG

      # run GPU local integration tests
      - printf "$SETUP_CMDS" > $SETUP_FILE
      # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
      - generic_cmd="pytest test-toolkit/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
      - dlc_cmd="pytest test-toolkit/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
      - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"

      # run CPU sagemaker integration tests
      - test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
      - test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"

      # run GPU sagemaker integration tests
      - test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
      - test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"

      # run EIA sagemaker integration tests
      - test_cmd="pytest test-toolkit/integration/sagemaker --build-image --push-image --dockerfile-type dlc.eia --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $EIA_FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --accelerator-type $EIA_ACCELERATOR_TYPE --tag $DLC_EIA_TAG"
      - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"

    # Cleanup runs even when the build commands above fail.
    finally:
      # shut down remote GPU instance
      - cleanup-gpu-instances
      - cleanup-key-pairs

      # remove ECR image
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
      - aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG
139 changes: 95 additions & 44 deletions test-toolkit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@
import platform
import pytest
import shutil
import sys
import tempfile

from sagemaker import LocalSession, Session
from sagemaker.pytorch import PyTorch

from test.utils import image_utils
from utils import image_utils

logger = logging.getLogger(__name__)
logging.getLogger('boto').setLevel(logging.INFO)
Expand All @@ -44,19 +42,36 @@


def pytest_addoption(parser):
parser.addoption('--build-image', '-D', action='store_true')
parser.addoption('--build-base-image', '-B', action='store_true')
parser.addoption('--aws-id')
parser.addoption('--build-image', '-B', action='store_true')
parser.addoption('--push-image', '-P', action='store_true')
parser.addoption('--dockerfile-type', '-T',
choices=['dlc.cpu', 'dlc.gpu', 'dlc.eia', 'pytorch'],
default='pytorch')
parser.addoption('--dockerfile', '-D', default=None)
parser.addoption('--aws-id', default=None)
parser.addoption('--instance-type')
parser.addoption('--docker-base-name', default='pytorch')
parser.addoption('--accelerator-type')
parser.addoption('--docker-base-name', default='sagemaker-pytorch-inference')
parser.addoption('--region', default='us-west-2')
parser.addoption('--framework-version', default=PyTorch.LATEST_VERSION)
parser.addoption('--py-version', choices=['2', '3'], default=str(sys.version_info.major))
parser.addoption('--framework-version', default="1.4.0")
parser.addoption('--py-version', choices=['2', '3'], default='3')
# Processor is still "cpu" for EIA tests
parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu')
# If not specified, will default to {framework-version}-{processor}-py{py-version}
parser.addoption('--tag', default=None)


@pytest.fixture(scope='session', name='dockerfile_type')
def fixture_dockerfile_type(request):
    """Session-scoped value of the --dockerfile-type command-line option."""
    option_value = request.config.getoption('--dockerfile-type')
    return option_value


@pytest.fixture(scope='session', name='dockerfile')
def fixture_dockerfile(request, dockerfile_type):
    """Dockerfile name to use: an explicit --dockerfile wins, otherwise the
    name is derived from the dockerfile_type fixture (e.g. 'Dockerfile.pytorch')."""
    explicit = request.config.getoption('--dockerfile')
    if explicit:
        return explicit
    return 'Dockerfile.{}'.format(dockerfile_type)


@pytest.fixture(scope='session', name='docker_base_name')
def fixture_docker_base_name(request):
    """Session-scoped value of the --docker-base-name command-line option."""
    base_name = request.config.getoption('--docker-base-name')
    return base_name
Expand Down Expand Up @@ -89,11 +104,6 @@ def fixture_tag(request, framework_version, processor, py_version):
return provided_tag if provided_tag else default_tag


@pytest.fixture(scope='session', name='docker_image')
def fixture_docker_image(docker_base_name, tag):
    """Local Docker image reference in 'name:tag' form."""
    return ':'.join([docker_base_name, tag])


@pytest.fixture
def opt_ml():
tmp = tempfile.mkdtemp()
Expand All @@ -112,32 +122,25 @@ def fixture_use_gpu(processor):
return processor == 'gpu'


@pytest.fixture(scope='session', name='build_base_image', autouse=True)
def fixture_build_base_image(request, framework_version, py_version, processor, tag, docker_base_name):
    """Build the base image when --build-base-image was passed; in either case
    the fixture resolves to the image tag."""
    if not request.config.getoption('--build-base-image'):
        return tag
    repo_root = os.path.join(dir_path, '..')
    return image_utils.build_base_image(framework_name=docker_base_name,
                                        framework_version=framework_version,
                                        py_version=py_version,
                                        base_image_tag=tag,
                                        processor=processor,
                                        cwd=repo_root)


@pytest.fixture(scope='session', name='build_image', autouse=True)
def fixture_build_image(request, framework_version, py_version, processor, tag, docker_base_name):
def fixture_build_image(request, framework_version, dockerfile, image_uri, region):
build_image = request.config.getoption('--build-image')
if build_image:
return image_utils.build_image(framework_name=docker_base_name,
framework_version=framework_version,
py_version=py_version,
processor=processor,
tag=tag,
return image_utils.build_image(framework_version=framework_version,
dockerfile=dockerfile,
image_uri=image_uri,
region=region,
cwd=os.path.join(dir_path, '..'))

return tag
return image_uri


@pytest.fixture(scope='session', name='push_image', autouse=True)
def fixture_push_image(request, image_uri, region, aws_id):
    """Push the built image to ECR when --push-image was passed; no-op otherwise."""
    if not request.config.getoption('--push-image'):
        return None
    return image_utils.push_image(image_uri, region, aws_id)


@pytest.fixture(scope='session', name='sagemaker_session')
Expand All @@ -162,32 +165,80 @@ def fixture_instance_type(request, processor):
return provided_instance_type or default_instance_type


@pytest.fixture(name='accelerator_type', scope='session')
def fixture_accelerator_type(request):
    """EIA accelerator type from --accelerator-type (None when not provided)."""
    accelerator = request.config.getoption('--accelerator-type')
    return accelerator


@pytest.fixture(name='docker_registry', scope='session')
def fixture_docker_registry(aws_id, region):
return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region)
return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region) if aws_id else None


@pytest.fixture(name='image_uri', scope='session')
def fixture_image_uri(docker_registry, docker_base_name, tag):
    """Full image URI; the registry host is prepended only when one is configured."""
    repo_and_tag = '{}:{}'.format(docker_base_name, tag)
    if not docker_registry:
        return repo_and_tag
    return '/'.join([docker_registry, repo_and_tag])


@pytest.fixture(scope='session', name='dist_cpu_backend', params=['gloo'])
def fixture_dist_cpu_backend(request):
    """Parametrized distributed-training backend for CPU runs (gloo only)."""
    backend = request.param
    return backend


@pytest.fixture(name='ecr_image', scope='session')
def fixture_ecr_image(docker_registry, docker_base_name, tag):
return '{}/{}:{}'.format(docker_registry, docker_base_name, tag)
@pytest.fixture(scope='session', name='dist_gpu_backend', params=['gloo', 'nccl'])
def fixture_dist_gpu_backend(request):
    """Parametrized distributed-training backend for GPU runs (gloo and nccl)."""
    backend = request.param
    return backend


@pytest.fixture(autouse=True)
def skip_by_device_type(request, use_gpu, instance_type):
def skip_by_device_type(request, use_gpu, instance_type, accelerator_type):
is_gpu = use_gpu or instance_type[3] in ['g', 'p']
if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \
(request.node.get_closest_marker('skip_cpu') and not is_gpu):
is_eia = accelerator_type is not None

# Separate out cases for clearer logic.
# When running GPU test, skip CPU test. When running CPU test, skip GPU test.
if (request.node.get_closest_marker('gpu_test') and not is_gpu) or \
(request.node.get_closest_marker('cpu_test') and is_gpu):
pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))

# When running EIA test, skip the CPU and GPU functions
elif (request.node.get_closest_marker('gpu_test') or request.node.get_closest_marker('cpu_test')) and is_eia:
pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))

# When running CPU or GPU test, skip EIA test.
elif request.node.get_closest_marker('eia_test') and not is_eia:
pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))


@pytest.fixture(autouse=True)
def skip_by_py_version(request, py_version):
    """
    This will cause tests to be skipped w/ py3 containers if "py-version" flag is not set
    and pytest is running from py2. We can rely on this when py2 is deprecated, but for now
    we must use "skip_py2_containers"
    """
    # NOTE(review): the --py-version option declares choices=['2', '3'], so a raw
    # option value can never equal 'py3' and this marker would then always skip.
    # Presumably the py_version fixture (not visible here) prefixes 'py' — confirm
    # against its definition.
    if request.node.get_closest_marker('skip_py2') and py_version != 'py3':
        pytest.skip('Skipping the test because Python 2 is not supported.')


@pytest.fixture(autouse=True)
def skip_test_in_region(request, region):
    """Skip tests marked skip_test_in_region when running in me-south-1."""
    marked = request.node.get_closest_marker('skip_test_in_region') is not None
    if marked and region == 'me-south-1':
        pytest.skip('Skipping SageMaker test in region {}'.format(region))


@pytest.fixture(autouse=True)
def skip_gpu_instance_restricted_regions(region, instance_type):
if (region in NO_P2_REGIONS and instance_type.startswith('ml.p2')) \
or (region in NO_P3_REGIONS and instance_type.startswith('ml.p3')):
if ((region in NO_P2_REGIONS and instance_type.startswith('ml.p2'))
or (region in NO_P3_REGIONS and instance_type.startswith('ml.p3'))):
pytest.skip('Skipping GPU test in region {}'.format(region))


@pytest.fixture(autouse=True)
def skip_py2_containers(request, tag):
    """Skip tests marked skip_py2_containers when the image tag targets py2."""
    marked = request.node.get_closest_marker('skip_py2_containers') is not None
    if marked and 'py2' in tag:
        pytest.skip('Skipping python2 container with tag {}'.format(tag))
6 changes: 6 additions & 0 deletions test-toolkit/docker/1.3.1/Dockerfile.dlc.eia
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Test image: layers the sagemaker_pytorch_inference sdist (expected in dist/)
# on top of the DLC PyTorch EIA inference base image.
ARG region
# 763104351884 is the AWS Deep Learning Containers account (see DLC_ACCOUNT in the buildspec).
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference-eia:1.3.1-cpu-py3

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
# Install the package under test, then remove the tarball to keep the layer small.
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
    rm /sagemaker_pytorch_inference.tar.gz
6 changes: 6 additions & 0 deletions test-toolkit/docker/1.4.0/Dockerfile.dlc.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Test image: layers the sagemaker_pytorch_inference sdist (expected in dist/)
# on top of the DLC PyTorch CPU inference base image.
ARG region
# 763104351884 is the AWS Deep Learning Containers account (see DLC_ACCOUNT in the buildspec).
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.4.0-cpu-py3

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
# Install the package under test, then remove the tarball to keep the layer small.
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
    rm /sagemaker_pytorch_inference.tar.gz
6 changes: 6 additions & 0 deletions test-toolkit/docker/1.4.0/Dockerfile.dlc.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Test image: layers the sagemaker_pytorch_inference sdist (expected in dist/)
# on top of the DLC PyTorch GPU inference base image.
ARG region
# 763104351884 is the AWS Deep Learning Containers account (see DLC_ACCOUNT in the buildspec).
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.4.0-gpu-py3

COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
# Install the package under test, then remove the tarball to keep the layer small.
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
    rm /sagemaker_pytorch_inference.tar.gz
41 changes: 41 additions & 0 deletions test-toolkit/docker/1.4.0/Dockerfile.pytorch
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Test image: generic upstream PyTorch base plus the serving container under test,
# served via mxnet-model-server (MMS).
FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime

# SageMaker capability labels: bind to an arbitrary port and host multiple models.
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
LABEL com.amazonaws.sagemaker.capabilities.multi-models=true

ARG MMS_VERSION=1.0.8

ENV SAGEMAKER_SERVING_MODULE sagemaker_pytorch_serving_container.serving:main
ENV TEMP=/home/model-server/tmp

# System libraries (X/GL deps for opencv below, headless JDK for MMS).
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        libgl1-mesa-glx \
        libglib2.0-0 \
        libsm6 \
        libxext6 \
        libxrender-dev \
        openjdk-8-jdk-headless \
    && rm -rf /var/lib/apt/lists/*

# Install opencv from conda-forge and expose conda's pip as pip3.
RUN conda install -c conda-forge opencv==4.0.1 \
    && ln -s /opt/conda/bin/pip /usr/local/bin/pip3

RUN pip install mxnet-model-server==$MMS_VERSION

# Install the package under test (sdist expected in dist/), then drop the tarball.
COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
RUN pip install --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
    rm /sagemaker_pytorch_inference.tar.gz

# Non-root user for the model server with a writable home/tmp directory.
RUN useradd -m model-server \
    && mkdir -p /home/model-server/tmp \
    && chown -R model-server /home/model-server

COPY docker/build_artifacts/mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
COPY docker/build_artifacts/config.properties /home/model-server

RUN chmod +x /usr/local/bin/dockerd-entrypoint.py

# 8080: inference, 8081: management API.
EXPOSE 8080 8081
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
CMD ["mxnet-model-server", "--start", "--mms-config", "/home/model-server/config.properties"]
Loading