Skip to content

Commit e049d31

Browse files
authored
infra: refactor toolkit tests. (#56)
1 parent c20a4a2 commit e049d31

25 files changed

+377
-707
lines changed

.flake8

Lines changed: 3 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,3 @@
1+
[flake8]
2+
application_import_names = image_utils, integration, local_mode_utils, sagemaker_pytorch_container, test, test-toolkit, timeout, utils
3+
import-order-style = google

buildspec-release.yml

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -12,7 +12,7 @@ phases:
1212
# run unit tests
1313
- AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN=
1414
AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION=
15-
tox -e py27,py36 -- test/unit
15+
tox -e py36 -- test/unit
1616

1717
# run local integ tests
1818
#- $(aws ecr get-login --no-include-email --region us-west-2)

buildspec-toolkit.yml

Lines changed: 102 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,102 @@
1+
version: 0.2
2+
3+
env:
4+
variables:
5+
FRAMEWORK_VERSION: '1.4.0'
6+
EIA_FRAMEWORK_VERSION: '1.3.1'
7+
CPU_INSTANCE_TYPE: 'ml.c4.xlarge'
8+
GPU_INSTANCE_TYPE: 'ml.p2.xlarge'
9+
EIA_ACCELERATOR_TYPE: 'ml.eia2.medium'
10+
ECR_REPO: 'sagemaker-test'
11+
GITHUB_REPO: 'sagemaker-pytorch-serving-container'
12+
DLC_ACCOUNT: '763104351884'
13+
SETUP_FILE: 'setup_cmds.sh'
14+
SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]'
15+
16+
phases:
17+
pre_build:
18+
commands:
19+
- start-dockerd
20+
- ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 'Account' --output text)
21+
- PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO"
22+
- PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+')
23+
- BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')"
24+
- echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.'
25+
26+
build:
27+
commands:
28+
- TOX_PARALLEL_NO_SPINNER=1
29+
- PY_COLORS=0
30+
31+
# install
32+
- pip3 install -U -e .[test]
33+
34+
# run linters
35+
- tox -e flake8,twine
36+
37+
# run unit tests
38+
- tox -e py36 test-toolkit/unit
39+
40+
# define tags
41+
- GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID"
42+
- DLC_CPU_TAG="$FRAMEWORK_VERSION-dlc-cpu-$BUILD_ID"
43+
- DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID"
44+
- DLC_EIA_TAG="$FRAMEWORK_VERSION-dlc-eia-$BUILD_ID"
45+
46+
# run local CPU integration tests (build and push the image to ECR repo)
47+
- test_cmd="pytest test-toolkit/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG"
48+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
49+
- test_cmd="pytest test-toolkit/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG"
50+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
51+
52+
# launch remote GPU instance
53+
- prefix='ml.'
54+
- instance_type=${GPU_INSTANCE_TYPE#"$prefix"}
55+
- create-key-pair
56+
- launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu-latest
57+
58+
# build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test
59+
- python3 setup.py sdist
60+
- build_dir="test-toolkit/docker/$FRAMEWORK_VERSION"
61+
- $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
62+
- docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION .
63+
# push DLC GPU image to ECR
64+
- $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION)
65+
- docker push $PREPROD_IMAGE:$DLC_GPU_TAG
66+
67+
# run GPU local integration tests
68+
- printf "$SETUP_CMDS" > $SETUP_FILE
69+
# no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests
70+
- generic_cmd="pytest test-toolkit/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG"
71+
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\""
72+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
73+
- dlc_cmd="pytest test-toolkit/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG"
74+
- test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup"
75+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
76+
77+
# run CPU sagemaker integration tests
78+
- test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG"
79+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
80+
- test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG"
81+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
82+
83+
# run GPU sagemaker integration tests
84+
- test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG"
85+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
86+
- test_cmd="pytest test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG"
87+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
88+
89+
# run EIA sagemaker integration tests
90+
- test_cmd="pytest test-toolkit/integration/sagemaker --build-image --push-image --dockerfile-type dlc.eia --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $EIA_FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --accelerator-type $EIA_ACCELERATOR_TYPE --tag $DLC_EIA_TAG"
91+
- execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*"
92+
93+
finally:
94+
# shut down remote GPU instance
95+
- cleanup-gpu-instances
96+
- cleanup-key-pairs
97+
98+
# remove ECR image
99+
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GENERIC_TAG
100+
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_CPU_TAG
101+
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_GPU_TAG
102+
- aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$DLC_EIA_TAG

test-toolkit/conftest.py

Lines changed: 95 additions & 44 deletions
Original file line number · Diff line number · Diff line change
@@ -18,13 +18,11 @@
1818
import platform
1919
import pytest
2020
import shutil
21-
import sys
2221
import tempfile
2322

2423
from sagemaker import LocalSession, Session
25-
from sagemaker.pytorch import PyTorch
2624

27-
from test.utils import image_utils
25+
from utils import image_utils
2826

2927
logger = logging.getLogger(__name__)
3028
logging.getLogger('boto').setLevel(logging.INFO)
@@ -44,19 +42,36 @@
4442

4543

4644
def pytest_addoption(parser):
47-
parser.addoption('--build-image', '-D', action='store_true')
48-
parser.addoption('--build-base-image', '-B', action='store_true')
49-
parser.addoption('--aws-id')
45+
parser.addoption('--build-image', '-B', action='store_true')
46+
parser.addoption('--push-image', '-P', action='store_true')
47+
parser.addoption('--dockerfile-type', '-T',
48+
choices=['dlc.cpu', 'dlc.gpu', 'dlc.eia', 'pytorch'],
49+
default='pytorch')
50+
parser.addoption('--dockerfile', '-D', default=None)
51+
parser.addoption('--aws-id', default=None)
5052
parser.addoption('--instance-type')
51-
parser.addoption('--docker-base-name', default='pytorch')
53+
parser.addoption('--accelerator-type')
54+
parser.addoption('--docker-base-name', default='sagemaker-pytorch-inference')
5255
parser.addoption('--region', default='us-west-2')
53-
parser.addoption('--framework-version', default=PyTorch.LATEST_VERSION)
54-
parser.addoption('--py-version', choices=['2', '3'], default=str(sys.version_info.major))
56+
parser.addoption('--framework-version', default="1.4.0")
57+
parser.addoption('--py-version', choices=['2', '3'], default='3')
58+
# Processor is still "cpu" for EIA tests
5559
parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu')
5660
# If not specified, will default to {framework-version}-{processor}-py{py-version}
5761
parser.addoption('--tag', default=None)
5862

5963

64+
@pytest.fixture(scope='session', name='dockerfile_type')
65+
def fixture_dockerfile_type(request):
66+
return request.config.getoption('--dockerfile-type')
67+
68+
69+
@pytest.fixture(scope='session', name='dockerfile')
70+
def fixture_dockerfile(request, dockerfile_type):
71+
dockerfile = request.config.getoption('--dockerfile')
72+
return dockerfile if dockerfile else 'Dockerfile.{}'.format(dockerfile_type)
73+
74+
6075
@pytest.fixture(scope='session', name='docker_base_name')
6176
def fixture_docker_base_name(request):
6277
return request.config.getoption('--docker-base-name')
@@ -89,11 +104,6 @@ def fixture_tag(request, framework_version, processor, py_version):
89104
return provided_tag if provided_tag else default_tag
90105

91106

92-
@pytest.fixture(scope='session', name='docker_image')
93-
def fixture_docker_image(docker_base_name, tag):
94-
return '{}:{}'.format(docker_base_name, tag)
95-
96-
97107
@pytest.fixture
98108
def opt_ml():
99109
tmp = tempfile.mkdtemp()
@@ -112,32 +122,25 @@ def fixture_use_gpu(processor):
112122
return processor == 'gpu'
113123

114124

115-
@pytest.fixture(scope='session', name='build_base_image', autouse=True)
116-
def fixture_build_base_image(request, framework_version, py_version, processor, tag, docker_base_name):
117-
build_base_image = request.config.getoption('--build-base-image')
118-
if build_base_image:
119-
return image_utils.build_base_image(framework_name=docker_base_name,
120-
framework_version=framework_version,
121-
py_version=py_version,
122-
base_image_tag=tag,
123-
processor=processor,
124-
cwd=os.path.join(dir_path, '..'))
125-
126-
return tag
127-
128-
129125
@pytest.fixture(scope='session', name='build_image', autouse=True)
130-
def fixture_build_image(request, framework_version, py_version, processor, tag, docker_base_name):
126+
def fixture_build_image(request, framework_version, dockerfile, image_uri, region):
131127
build_image = request.config.getoption('--build-image')
132128
if build_image:
133-
return image_utils.build_image(framework_name=docker_base_name,
134-
framework_version=framework_version,
135-
py_version=py_version,
136-
processor=processor,
137-
tag=tag,
129+
return image_utils.build_image(framework_version=framework_version,
130+
dockerfile=dockerfile,
131+
image_uri=image_uri,
132+
region=region,
138133
cwd=os.path.join(dir_path, '..'))
139134

140-
return tag
135+
return image_uri
136+
137+
138+
@pytest.fixture(scope='session', name='push_image', autouse=True)
139+
def fixture_push_image(request, image_uri, region, aws_id):
140+
push_image = request.config.getoption('--push-image')
141+
if push_image:
142+
return image_utils.push_image(image_uri, region, aws_id)
143+
return None
141144

142145

143146
@pytest.fixture(scope='session', name='sagemaker_session')
@@ -162,32 +165,80 @@ def fixture_instance_type(request, processor):
162165
return provided_instance_type or default_instance_type
163166

164167

168+
@pytest.fixture(name='accelerator_type', scope='session')
169+
def fixture_accelerator_type(request):
170+
return request.config.getoption('--accelerator-type')
171+
172+
165173
@pytest.fixture(name='docker_registry', scope='session')
166174
def fixture_docker_registry(aws_id, region):
167-
return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region)
175+
return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region) if aws_id else None
176+
177+
178+
@pytest.fixture(name='image_uri', scope='session')
179+
def fixture_image_uri(docker_registry, docker_base_name, tag):
180+
if docker_registry:
181+
return '{}/{}:{}'.format(docker_registry, docker_base_name, tag)
182+
return '{}:{}'.format(docker_base_name, tag)
183+
184+
185+
@pytest.fixture(scope='session', name='dist_cpu_backend', params=['gloo'])
186+
def fixture_dist_cpu_backend(request):
187+
return request.param
168188

169189

170-
@pytest.fixture(name='ecr_image', scope='session')
171-
def fixture_ecr_image(docker_registry, docker_base_name, tag):
172-
return '{}/{}:{}'.format(docker_registry, docker_base_name, tag)
190+
@pytest.fixture(scope='session', name='dist_gpu_backend', params=['gloo', 'nccl'])
191+
def fixture_dist_gpu_backend(request):
192+
return request.param
173193

174194

175195
@pytest.fixture(autouse=True)
176-
def skip_by_device_type(request, use_gpu, instance_type):
196+
def skip_by_device_type(request, use_gpu, instance_type, accelerator_type):
177197
is_gpu = use_gpu or instance_type[3] in ['g', 'p']
178-
if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \
179-
(request.node.get_closest_marker('skip_cpu') and not is_gpu):
198+
is_eia = accelerator_type is not None
199+
200+
# Separate out cases for clearer logic.
201+
# When running GPU test, skip CPU test. When running CPU test, skip GPU test.
202+
if (request.node.get_closest_marker('gpu_test') and not is_gpu) or \
203+
(request.node.get_closest_marker('cpu_test') and is_gpu):
204+
pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))
205+
206+
# When running EIA test, skip the CPU and GPU functions
207+
elif (request.node.get_closest_marker('gpu_test') or request.node.get_closest_marker('cpu_test')) and is_eia:
208+
pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))
209+
210+
# When running CPU or GPU test, skip EIA test.
211+
elif request.node.get_closest_marker('eia_test') and not is_eia:
180212
pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type))
181213

182214

183215
@pytest.fixture(autouse=True)
184216
def skip_by_py_version(request, py_version):
217+
"""
218+
This will cause tests to be skipped w/ py3 containers if "py-version" flag is not set
219+
and pytest is running from py2. We can rely on this when py2 is deprecated, but for now
220+
we must use "skip_py2_containers"
221+
"""
185222
if request.node.get_closest_marker('skip_py2') and py_version != 'py3':
186223
pytest.skip('Skipping the test because Python 2 is not supported.')
187224

188225

226+
@pytest.fixture(autouse=True)
227+
def skip_test_in_region(request, region):
228+
if request.node.get_closest_marker('skip_test_in_region'):
229+
if region == 'me-south-1':
230+
pytest.skip('Skipping SageMaker test in region {}'.format(region))
231+
232+
189233
@pytest.fixture(autouse=True)
190234
def skip_gpu_instance_restricted_regions(region, instance_type):
191-
if (region in NO_P2_REGIONS and instance_type.startswith('ml.p2')) \
192-
or (region in NO_P3_REGIONS and instance_type.startswith('ml.p3')):
235+
if ((region in NO_P2_REGIONS and instance_type.startswith('ml.p2'))
236+
or (region in NO_P3_REGIONS and instance_type.startswith('ml.p3'))):
193237
pytest.skip('Skipping GPU test in region {}'.format(region))
238+
239+
240+
@pytest.fixture(autouse=True)
241+
def skip_py2_containers(request, tag):
242+
if request.node.get_closest_marker('skip_py2_containers'):
243+
if 'py2' in tag:
244+
pytest.skip('Skipping python2 container with tag {}'.format(tag))
Lines changed: 6 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,6 @@
1+
ARG region
2+
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference-eia:1.3.1-cpu-py3
3+
4+
COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
5+
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
6+
rm /sagemaker_pytorch_inference.tar.gz
Lines changed: 6 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,6 @@
1+
ARG region
2+
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.4.0-cpu-py3
3+
4+
COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
5+
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
6+
rm /sagemaker_pytorch_inference.tar.gz
Lines changed: 6 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,6 @@
1+
ARG region
2+
FROM 763104351884.dkr.ecr.$region.amazonaws.com/pytorch-inference:1.4.0-gpu-py3
3+
4+
COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
5+
RUN pip install --upgrade --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
6+
rm /sagemaker_pytorch_inference.tar.gz
Lines changed: 41 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,41 @@
1+
FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
2+
3+
LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
4+
LABEL com.amazonaws.sagemaker.capabilities.multi-models=true
5+
6+
ARG MMS_VERSION=1.0.8
7+
8+
ENV SAGEMAKER_SERVING_MODULE sagemaker_pytorch_serving_container.serving:main
9+
ENV TEMP=/home/model-server/tmp
10+
11+
RUN apt-get update \
12+
&& apt-get install -y --no-install-recommends \
13+
libgl1-mesa-glx \
14+
libglib2.0-0 \
15+
libsm6 \
16+
libxext6 \
17+
libxrender-dev \
18+
openjdk-8-jdk-headless \
19+
&& rm -rf /var/lib/apt/lists/*
20+
21+
RUN conda install -c conda-forge opencv==4.0.1 \
22+
&& ln -s /opt/conda/bin/pip /usr/local/bin/pip3
23+
24+
RUN pip install mxnet-model-server==$MMS_VERSION
25+
26+
COPY dist/sagemaker_pytorch_inference-*.tar.gz /sagemaker_pytorch_inference.tar.gz
27+
RUN pip install --no-cache-dir /sagemaker_pytorch_inference.tar.gz && \
28+
rm /sagemaker_pytorch_inference.tar.gz
29+
30+
RUN useradd -m model-server \
31+
&& mkdir -p /home/model-server/tmp \
32+
&& chown -R model-server /home/model-server
33+
34+
COPY docker/build_artifacts/mms-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
35+
COPY docker/build_artifacts/config.properties /home/model-server
36+
37+
RUN chmod +x /usr/local/bin/dockerd-entrypoint.py
38+
39+
EXPOSE 8080 8081
40+
ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
41+
CMD ["mxnet-model-server", "--start", "--mms-config", "/home/model-server/config.properties"]

0 commit comments

Comments (0)