diff --git a/buildspec-release.yml b/buildspec-release.yml index 17e0f5ce..7285bb3b 100644 --- a/buildspec-release.yml +++ b/buildspec-release.yml @@ -12,7 +12,7 @@ phases: # run unit tests - AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_SESSION_TOKEN= AWS_CONTAINER_CREDENTIALS_RELATIVE_URI= AWS_DEFAULT_REGION= - tox -e py27,py36,py37 -- test-toolkit/unit + tox -e py27,py36,py37 -- test/unit # run local integ tests #- $(aws ecr get-login --no-include-email --region us-west-2) diff --git a/buildspec-toolkit.yml b/buildspec-toolkit.yml index 5e11ec1b..a7046e0b 100644 --- a/buildspec-toolkit.yml +++ b/buildspec-toolkit.yml @@ -33,7 +33,7 @@ phases: - tox -e flake8,twine # run unit tests - - tox -e py27,py36,py37 test-toolkit/unit + - tox -e py27,py36,py37 test/unit # define tags - GENERIC_TAG="$FRAMEWORK_VERSION-pytorch-$BUILD_ID" @@ -41,10 +41,10 @@ phases: - DLC_GPU_TAG="$FRAMEWORK_VERSION-dlc-gpu-$BUILD_ID" # run local CPU integration tests (build and push the image to ECR repo) - - test_cmd="pytest test-toolkit/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" - - test_cmd="pytest test-toolkit/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" + - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type pytorch --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT 
--framework-version $FRAMEWORK_VERSION --processor cpu --tag $GENERIC_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" + - test_cmd="pytest test/integration/local --build-image --push-image --dockerfile-type dlc.cpu --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --tag $DLC_CPU_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" # launch remote GPU instance - prefix='ml.' @@ -54,7 +54,7 @@ phases: # build DLC GPU image because the base DLC image is too big and takes too long to build as part of the test - python3 setup.py sdist - - build_dir="test-toolkit/docker/$FRAMEWORK_VERSION" + - build_dir="test/container/$FRAMEWORK_VERSION" - $(aws ecr get-login --registry-ids $DLC_ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) - docker build -f "$build_dir/Dockerfile.dlc.gpu" -t $PREPROD_IMAGE:$DLC_GPU_TAG --build-arg region=$AWS_DEFAULT_REGION . 
# push DLC GPU image to ECR @@ -64,24 +64,24 @@ phases: # run GPU local integration tests - printf "$SETUP_CMDS" > $SETUP_FILE # no reason to rebuild the image again since it was already built and pushed to ECR during CPU tests - - generic_cmd="pytest test-toolkit/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG" + - generic_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $GENERIC_TAG" - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$generic_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" - - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" - - dlc_cmd="pytest test-toolkit/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" + - dlc_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --tag $DLC_GPU_TAG" - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$dlc_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\" --skip-setup" - - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" # run CPU sagemaker integration tests - - test_cmd="pytest -n 10 
test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" - - test_cmd="pytest -n 10 test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" + - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $GENERIC_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" + - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $DLC_CPU_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" # run GPU sagemaker integration tests - - test_cmd="pytest -n 10 test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" 
"docker/build_artifacts/*" - - test_cmd="pytest -n 10 test-toolkit/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test-toolkit/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" + - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GENERIC_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" + - test_cmd="pytest -n 10 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $DLC_GPU_TAG" + - execute-command-if-has-matching-changes "$test_cmd" "test/" "src/*.py" "setup.py" "setup.cfg" "buildspec-toolkit.yml" "docker/build_artifacts/*" finally: # shut down remote GPU instance diff --git a/buildspec.yml b/buildspec.yml index c1f46fa9..c43cb34f 100644 --- a/buildspec.yml +++ b/buildspec.yml @@ -1,88 +1,13 @@ version: 0.2 -env: - variables: - FRAMEWORK_VERSION: '1.5.0' - CPU_PY3_VERSION: '3' - CPU_INSTANCE_TYPE: 'ml.c4.xlarge' - GPU_PY3_VERSION: '3' - GPU_INSTANCE_TYPE: 'ml.p2.8xlarge' - LOCAL_BASE_REPO: 'pytorch-base' - ECR_REPO: 'sagemaker-test' - GITHUB_REPO: 'sagemaker-pytorch-container' - SETUP_FILE: 'setup_cmds.sh' - SETUP_CMDS: '#!/bin/bash\npip install --upgrade pip\npip install -U -e .\npip install -U -e .[test]' - - phases: pre_build: commands: - - start-dockerd - - ACCOUNT=$(aws --region $AWS_DEFAULT_REGION sts --endpoint-url https://sts.$AWS_DEFAULT_REGION.amazonaws.com get-caller-identity --query 
'Account' --output text) - - PREPROD_IMAGE="$ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$ECR_REPO" - PR_NUM=$(echo $CODEBUILD_SOURCE_VERSION | grep -o '[0-9]\+') - - BUILD_ID="$(echo $CODEBUILD_BUILD_ID | sed -e 's/:/-/g')" - echo 'Pull request number:' $PR_NUM '. No value means this build is not from pull request.' build: commands: - # install - - pip3 install -U -e .[test] - - # run linter - - tox -e flake8 - - - cpu_dockerfile="Dockerfile.cpu" - - gpu_dockerfile="Dockerfile.gpu" - - # build py3 image - - build_dir="docker/$FRAMEWORK_VERSION/py$GPU_PY3_VERSION" - - cp -r docker/build_artifacts/* $build_dir/ - - CPU_PY3_TAG="$FRAMEWORK_VERSION-cpu-py3-$BUILD_ID" - - GPU_PY3_TAG="$FRAMEWORK_VERSION-gpu-py3-$BUILD_ID" - - build_cmd="docker build -f "$build_dir/$cpu_dockerfile" -t $PREPROD_IMAGE:$CPU_PY3_TAG $build_dir" - - execute-command-if-has-matching-changes "$build_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - build_cmd="docker build -f "$build_dir/$gpu_dockerfile" -t $PREPROD_IMAGE:$GPU_PY3_TAG $build_dir" - - execute-command-if-has-matching-changes "$build_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - # push images to ecr - - $(aws ecr get-login --registry-ids $ACCOUNT --no-include-email --region $AWS_DEFAULT_REGION) - - push_cmd="docker push $PREPROD_IMAGE:$CPU_PY3_TAG" - - execute-command-if-has-matching-changes "$push_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - push_cmd="docker push $PREPROD_IMAGE:$GPU_PY3_TAG" - - execute-command-if-has-matching-changes "$push_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - # launch remote gpu instance - - prefix='ml.' 
- - instance_type=${GPU_INSTANCE_TYPE#"$prefix"} - - create-key-pair - - launch-ec2-instance --instance-type $instance_type --ami-name dlami-ubuntu - - # run cpu integration tests - - test_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY3_VERSION --processor cpu --tag $CPU_PY3_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - # run gpu integration tests - - printf "$SETUP_CMDS" > $SETUP_FILE - - py3_cmd="pytest test/integration/local --region $AWS_DEFAULT_REGION --docker-base-name $PREPROD_IMAGE --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY3_VERSION --processor gpu --tag $GPU_PY3_TAG" - - test_cmd="remote-test --github-repo $GITHUB_REPO --test-cmd \"$py3_cmd\" --setup-file $SETUP_FILE --pr-number \"$PR_NUM\"" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - # run cpu sagemaker tests - - test_cmd="pytest -n 15 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $CPU_PY3_VERSION --processor cpu --instance-type $CPU_INSTANCE_TYPE --tag $CPU_PY3_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - # run gpu sagemaker tests - - test_cmd="pytest -n 15 test/integration/sagemaker --region $AWS_DEFAULT_REGION --docker-base-name $ECR_REPO --aws-id $ACCOUNT --framework-version $FRAMEWORK_VERSION --py-version $GPU_PY3_VERSION --processor gpu --instance-type $GPU_INSTANCE_TYPE --tag $GPU_PY3_TAG" - - execute-command-if-has-matching-changes "$test_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - finally: - # shut down remote gpu instance - - cleanup-gpu-instances - - cleanup-key-pairs - # remove ecr image - - rm_cmd="aws ecr batch-delete-image --repository-name $ECR_REPO --region 
$AWS_DEFAULT_REGION --image-ids imageTag=$CPU_PY3_TAG" - - execute-command-if-has-matching-changes "$rm_cmd" "test/" "tests/" "docker/*" "buildspec.yml" - - rm_cmd="aws ecr batch-delete-image --repository-name $ECR_REPO --region $AWS_DEFAULT_REGION --image-ids imageTag=$GPU_PY3_TAG" - - execute-command-if-has-matching-changes "$rm_cmd" "test/" "tests/" "docker/*" "buildspec.yml" + - error_cmd="echo 'In order to make changes to the docker files, please, use https://github.com/aws/deep-learning-containers repository.' && exit 1" + - execute-command-if-has-matching-changes "$error_cmd" "docker/" diff --git a/test-toolkit/conftest.py b/test-toolkit/conftest.py deleted file mode 100644 index 13e28de3..00000000 --- a/test-toolkit/conftest.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import boto3 -import os -import logging -import platform -import pytest -import shutil -import sys -import tempfile - -from sagemaker import LocalSession, Session - -from utils import image_utils - -logger = logging.getLogger(__name__) -logging.getLogger('boto').setLevel(logging.INFO) -logging.getLogger('boto3').setLevel(logging.INFO) -logging.getLogger('botocore').setLevel(logging.INFO) -logging.getLogger('factory.py').setLevel(logging.INFO) -logging.getLogger('auth.py').setLevel(logging.INFO) -logging.getLogger('connectionpool.py').setLevel(logging.INFO) - - -dir_path = os.path.dirname(os.path.realpath(__file__)) - -NO_P2_REGIONS = ['ap-east-1', 'ap-northeast-3', 'ap-southeast-2', 'ca-central-1', 'eu-central-1', 'eu-north-1', - 'eu-west-2', 'eu-west-3', 'us-west-1', 'sa-east-1', 'me-south-1'] -NO_P3_REGIONS = ['ap-east-1', 'ap-northeast-3', 'ap-southeast-1', 'ap-southeast-2', 'ap-south-1', 'ca-central-1', - 'eu-central-1', 'eu-north-1', 'eu-west-2', 'eu-west-3', 'sa-east-1', 'us-west-1', 'me-south-1'] - - -def pytest_addoption(parser): - parser.addoption('--build-image', '-B', action='store_true') - parser.addoption('--push-image', '-P', action='store_true') - parser.addoption('--dockerfile-type', '-T', choices=['dlc.cpu', 'dlc.gpu', 'pytorch'], - default='pytorch') - parser.addoption('--dockerfile', '-D', default=None) - parser.addoption('--aws-id', default=None) - parser.addoption('--instance-type') - parser.addoption('--docker-base-name', default='sagemaker-pytorch-training') - parser.addoption('--region', default='us-west-2') - parser.addoption('--framework-version', default="1.4.0") - parser.addoption('--py-version', choices=['2', '3'], default=str(sys.version_info.major)) - parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu') - # If not specified, will default to {framework-version}-{processor}-py{py-version} - parser.addoption('--tag', default=None) - - -@pytest.fixture(scope='session', 
name='dockerfile_type') -def fixture_dockerfile_type(request): - return request.config.getoption('--dockerfile-type') - - -@pytest.fixture(scope='session', name='dockerfile') -def fixture_dockerfile(request, dockerfile_type): - dockerfile = request.config.getoption('--dockerfile') - return dockerfile if dockerfile else 'Dockerfile.{}'.format(dockerfile_type) - - -@pytest.fixture(scope='session', name='docker_base_name') -def fixture_docker_base_name(request): - return request.config.getoption('--docker-base-name') - - -@pytest.fixture(scope='session', name='region') -def fixture_region(request): - return request.config.getoption('--region') - - -@pytest.fixture(scope='session', name='framework_version') -def fixture_framework_version(request): - return request.config.getoption('--framework-version') - - -@pytest.fixture(scope='session', name='py_version') -def fixture_py_version(request): - return 'py{}'.format(int(request.config.getoption('--py-version'))) - - -@pytest.fixture(scope='session', name='processor') -def fixture_processor(request): - return request.config.getoption('--processor') - - -@pytest.fixture(scope='session', name='tag') -def fixture_tag(request, framework_version, processor, py_version): - provided_tag = request.config.getoption('--tag') - default_tag = '{}-{}-{}'.format(framework_version, processor, py_version) - return provided_tag if provided_tag else default_tag - - -@pytest.fixture -def opt_ml(): - tmp = tempfile.mkdtemp() - os.mkdir(os.path.join(tmp, 'output')) - - # Docker cannot mount Mac OS /var folder properly see - # https://forums.docker.com/t/var-folders-isnt-mounted-properly/9600 - opt_ml_dir = '/private{}'.format(tmp) if platform.system() == 'Darwin' else tmp - yield opt_ml_dir - - shutil.rmtree(tmp, True) - - -@pytest.fixture(scope='session', name='use_gpu') -def fixture_use_gpu(processor): - return processor == 'gpu' - - -@pytest.fixture(scope='session', name='build_image', autouse=True) -def fixture_build_image(request, 
framework_version, dockerfile, image_uri, region): - build_image = request.config.getoption('--build-image') - if build_image: - return image_utils.build_image(framework_version=framework_version, - dockerfile=dockerfile, - image_uri=image_uri, - region=region, - cwd=os.path.join(dir_path, '..')) - - return image_uri - - -@pytest.fixture(scope='session', name='push_image', autouse=True) -def fixture_push_image(request, image_uri, region, aws_id): - push_image = request.config.getoption('--push-image') - if push_image: - return image_utils.push_image(image_uri, region, aws_id) - return None - - -@pytest.fixture(scope='session', name='sagemaker_session') -def fixture_sagemaker_session(region): - return Session(boto_session=boto3.Session(region_name=region)) - - -@pytest.fixture(scope='session', name='sagemaker_local_session') -def fixture_sagemaker_local_session(region): - return LocalSession(boto_session=boto3.Session(region_name=region)) - - -@pytest.fixture(name='aws_id', scope='session') -def fixture_aws_id(request): - return request.config.getoption('--aws-id') - - -@pytest.fixture(name='instance_type', scope='session') -def fixture_instance_type(request, processor): - provided_instance_type = request.config.getoption('--instance-type') - default_instance_type = 'local' if processor == 'cpu' else 'local_gpu' - return provided_instance_type or default_instance_type - - -@pytest.fixture(name='docker_registry', scope='session') -def fixture_docker_registry(aws_id, region): - return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region) if aws_id else None - - -@pytest.fixture(name='image_uri', scope='session') -def fixture_image_uri(docker_registry, docker_base_name, tag): - if docker_registry: - return '{}/{}:{}'.format(docker_registry, docker_base_name, tag) - return '{}:{}'.format(docker_base_name, tag) - - -@pytest.fixture(scope='session', name='dist_cpu_backend', params=['gloo']) -def fixture_dist_cpu_backend(request): - return request.param - - 
-@pytest.fixture(scope='session', name='dist_gpu_backend', params=['gloo', 'nccl']) -def fixture_dist_gpu_backend(request): - return request.param - - -@pytest.fixture(autouse=True) -def skip_by_device_type(request, use_gpu, instance_type): - is_gpu = use_gpu or instance_type[3] in ['g', 'p'] - if (request.node.get_closest_marker('skip_gpu') and is_gpu) or \ - (request.node.get_closest_marker('skip_cpu') and not is_gpu): - pytest.skip('Skipping because running on \'{}\' instance'.format(instance_type)) - - -@pytest.fixture(autouse=True) -def skip_by_py_version(request, py_version): - """ - This will cause tests to be skipped w/ py3 containers if "py-version" flag is not set - and pytest is running from py2. We can rely on this when py2 is deprecated, but for now - we must use "skip_py2_containers" - """ - if request.node.get_closest_marker('skip_py2') and py_version != 'py3': - pytest.skip('Skipping the test because Python 2 is not supported.') - - -@pytest.fixture(autouse=True) -def skip_test_in_region(request, region): - if request.node.get_closest_marker('skip_test_in_region'): - if region == 'me-south-1': - pytest.skip('Skipping SageMaker test in region {}'.format(region)) - - -@pytest.fixture(autouse=True) -def skip_gpu_instance_restricted_regions(region, instance_type): - if ((region in NO_P2_REGIONS and instance_type.startswith('ml.p2')) - or (region in NO_P3_REGIONS and instance_type.startswith('ml.p3'))): - pytest.skip('Skipping GPU test in region {}'.format(region)) - - -@pytest.fixture(autouse=True) -def skip_py2_containers(request, tag): - if request.node.get_closest_marker('skip_py2_containers'): - if 'py2' in tag: - pytest.skip('Skipping python2 container with tag {}'.format(tag)) diff --git a/test-toolkit/integration/__init__.py b/test-toolkit/integration/__init__.py deleted file mode 100644 index e879ae0e..00000000 --- a/test-toolkit/integration/__init__.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. 
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import os - -resources_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'resources')) -mnist_path = os.path.join(resources_path, 'mnist') -mnist_script = os.path.join(mnist_path, 'mnist.py') -fastai_path = os.path.join(resources_path, 'fastai') -fastai_cifar_script = os.path.join(fastai_path, 'train_cifar.py') -fastai_mnist_script = os.path.join(fastai_path, 'mnist.py') -data_dir = os.path.join(mnist_path, 'data') -training_dir = os.path.join(data_dir, 'training') -dist_operations_path = os.path.join(resources_path, 'distributed_operations.py') -smdebug_mnist_script = os.path.join(mnist_path, 'smdebug_mnist.py') - -mnist_1d_script = os.path.join(mnist_path, 'mnist_1d.py') -model_cpu_dir = os.path.join(mnist_path, 'model_cpu') -model_cpu_1d_dir = os.path.join(model_cpu_dir, '1d') -model_gpu_dir = os.path.join(mnist_path, 'model_gpu') -model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d') -call_model_fn_once_script = os.path.join(resources_path, 'call_model_fn_once.py') - -requirements_dir = os.path.join(resources_path, 'requirements') -requirements_script = 'entry.py' - -ROLE = 'dummy/unused-role' -DEFAULT_TIMEOUT = 20 -PYTHON3 = 'py3' diff --git a/test-toolkit/integration/local/__init__.py b/test-toolkit/integration/local/__init__.py deleted file mode 100644 index 79cb9cdf..00000000 --- a/test-toolkit/integration/local/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2018-2020 Amazon.com, 
Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import diff --git a/test-toolkit/integration/local/test_distributed_training.py b/test-toolkit/integration/local/test_distributed_training.py deleted file mode 100644 index 553110a8..00000000 --- a/test-toolkit/integration/local/test_distributed_training.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import os - -import pytest -from sagemaker.pytorch import PyTorch - -from integration import data_dir, dist_operations_path, mnist_script, ROLE -from utils.local_mode_utils import assert_files_exist - -MODEL_SUCCESS_FILES = { - 'model': ['success'], - 'output': ['success'], -} - - -@pytest.fixture(scope='session', name='dist_gpu_backend', params=['gloo']) -def fixture_dist_gpu_backend(request): - return request.param - - -@pytest.mark.skip_gpu -def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir): - estimator = PyTorch(entry_point=dist_operations_path, - role=ROLE, - image_name=image_uri, - train_instance_count=2, - train_instance_type='local', - sagemaker_session=sagemaker_local_session, - hyperparameters={'backend': dist_cpu_backend}, - output_path='file://{}'.format(tmpdir)) - - _train_and_assert_success(estimator, str(tmpdir)) - - -@pytest.mark.skip_cpu -def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdir): - estimator = PyTorch(entry_point=dist_operations_path, - role=ROLE, - image_name=image_uri, - train_instance_count=1, - train_instance_type='local_gpu', - sagemaker_session=sagemaker_local_session, - hyperparameters={'backend': 'nccl'}, - output_path='file://{}'.format(tmpdir)) - - _train_and_assert_success(estimator, str(tmpdir)) - - -@pytest.mark.skip_gpu -def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir): - estimator = PyTorch(entry_point=mnist_script, - role=ROLE, - image_name=image_uri, - train_instance_count=2, - train_instance_type='local', - sagemaker_session=sagemaker_local_session, - hyperparameters={'backend': 'nccl'}, - output_path='file://{}'.format(tmpdir)) - - with pytest.raises(RuntimeError): - estimator.fit({'training': 'file://{}'.format(os.path.join(data_dir, 'training'))}) - - failure_file = {'output': ['failure']} - assert_files_exist(str(tmpdir), failure_file) - - -@pytest.mark.skip_gpu -def 
test_mnist_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir): - estimator = PyTorch(entry_point=mnist_script, - role=ROLE, - image_name=image_uri, - train_instance_count=2, - train_instance_type='local', - sagemaker_session=sagemaker_local_session, - hyperparameters={'backend': dist_cpu_backend}, - output_path='file://{}'.format(tmpdir)) - - success_files = { - 'model': ['model.pth'], - 'output': ['success'], - } - _train_and_assert_success(estimator, str(tmpdir), success_files) - - -def _train_and_assert_success(estimator, output_path, output_files=MODEL_SUCCESS_FILES): - estimator.fit({'training': 'file://{}'.format(os.path.join(data_dir, 'training'))}) - assert_files_exist(output_path, output_files) diff --git a/test-toolkit/integration/local/test_single_machine_training.py b/test-toolkit/integration/local/test_single_machine_training.py deleted file mode 100644 index 68f99859..00000000 --- a/test-toolkit/integration/local/test_single_machine_training.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import os - -from sagemaker.pytorch import PyTorch - -from utils.local_mode_utils import assert_files_exist -from integration import data_dir, mnist_script, ROLE - - -def test_mnist(image_uri, processor, instance_type, sagemaker_local_session, tmpdir): - estimator = PyTorch(entry_point=mnist_script, - role=ROLE, - image_name=image_uri, - train_instance_count=1, - train_instance_type=instance_type, - sagemaker_session=sagemaker_local_session, - hyperparameters={'processor': processor}, - output_path='file://{}'.format(tmpdir)) - - _train_and_assert_success(estimator, data_dir, str(tmpdir)) - - -def _train_and_assert_success(estimator, input_dir, output_path): - estimator.fit({'training': 'file://{}'.format(os.path.join(input_dir, 'training'))}) - - success_files = { - 'model': ['model.pth'], - 'output': ['success'], - } - assert_files_exist(output_path, success_files) diff --git a/test-toolkit/integration/sagemaker/__init__.py b/test-toolkit/integration/sagemaker/__init__.py deleted file mode 100644 index 79cb9cdf..00000000 --- a/test-toolkit/integration/sagemaker/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import diff --git a/test-toolkit/integration/sagemaker/test_distributed_operations.py b/test-toolkit/integration/sagemaker/test_distributed_operations.py deleted file mode 100644 index 6e072598..00000000 --- a/test-toolkit/integration/sagemaker/test_distributed_operations.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import os - -import boto3 -import pytest -from sagemaker import utils -from sagemaker.pytorch import PyTorch -from six.moves.urllib.parse import urlparse - -from integration import data_dir, dist_operations_path, mnist_script, DEFAULT_TIMEOUT -from integration.sagemaker.timeout import timeout - -MULTI_GPU_INSTANCE = 'ml.p3.8xlarge' - - -@pytest.mark.skip_gpu -@pytest.mark.deploy_test -@pytest.mark.skip_test_in_region -def test_dist_operations_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend): - instance_type = instance_type or 'ml.c4.xlarge' - _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_cpu_backend) - - -@pytest.mark.skip_cpu -@pytest.mark.deploy_test -def test_dist_operations_gpu(sagemaker_session, instance_type, image_uri, dist_gpu_backend): - instance_type = instance_type or 'ml.p2.xlarge' - _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_gpu_backend) - - -@pytest.mark.skip_cpu -def test_dist_operations_multi_gpu(sagemaker_session, image_uri, 
dist_gpu_backend): - _test_dist_operations(sagemaker_session, image_uri, MULTI_GPU_INSTANCE, dist_gpu_backend, 1) - - -@pytest.mark.skip_cpu -@pytest.mark.skip_py2_containers -def test_mnist_gpu(sagemaker_session, image_uri, dist_gpu_backend): - with timeout(minutes=DEFAULT_TIMEOUT): - pytorch = PyTorch(entry_point=mnist_script, - role='SageMakerRole', - train_instance_count=2, - image_name=image_uri, - train_instance_type=MULTI_GPU_INSTANCE, - sagemaker_session=sagemaker_session, - hyperparameters={'backend': dist_gpu_backend}) - - training_input = sagemaker_session.upload_data(path=os.path.join(data_dir, 'training'), - key_prefix='pytorch/mnist') - - job_name = utils.unique_name_from_base('test-pytorch-dist-ops') - pytorch.fit({'training': training_input}, job_name=job_name) - - -def _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_backend, train_instance_count=3): - with timeout(minutes=DEFAULT_TIMEOUT): - pytorch = PyTorch(entry_point=dist_operations_path, - role='SageMakerRole', - train_instance_count=train_instance_count, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=image_uri, - hyperparameters={'backend': dist_backend}) - - pytorch.sagemaker_session.default_bucket() - fake_input = pytorch.sagemaker_session.upload_data(path=dist_operations_path, - key_prefix='pytorch/distributed_operations') - - job_name = utils.unique_name_from_base('test-pytorch-dist-ops') - pytorch.fit({'required_argument': fake_input}, job_name=job_name) - - -def _assert_s3_file_exists(region, s3_url): - parsed_url = urlparse(s3_url) - s3 = boto3.resource('s3', region_name=region) - s3.Object(parsed_url.netloc, parsed_url.path.lstrip('/')).load() diff --git a/test-toolkit/integration/sagemaker/test_mnist.py b/test-toolkit/integration/sagemaker/test_mnist.py deleted file mode 100644 index 0a7fec96..00000000 --- a/test-toolkit/integration/sagemaker/test_mnist.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2018-2020 
Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import pytest -from sagemaker import utils -from sagemaker.pytorch import PyTorch - -from integration import training_dir, mnist_script, DEFAULT_TIMEOUT -from integration.sagemaker.timeout import timeout - - -@pytest.mark.skip_gpu -def test_mnist_distributed_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend): - instance_type = instance_type or 'ml.c4.xlarge' - _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_cpu_backend) - - -@pytest.mark.skip_cpu -def test_mnist_distributed_gpu(sagemaker_session, image_uri, instance_type, dist_gpu_backend): - instance_type = instance_type or 'ml.p2.xlarge' - _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_gpu_backend) - - -def _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_backend): - with timeout(minutes=DEFAULT_TIMEOUT): - pytorch = PyTorch(entry_point=mnist_script, - role='SageMakerRole', - train_instance_count=2, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=image_uri, - hyperparameters={'backend': dist_backend, 'epochs': 2}) - training_input = pytorch.sagemaker_session.upload_data(path=training_dir, - key_prefix='pytorch/mnist') - - job_name = utils.unique_name_from_base('test-pytorch-mnist') - - pytorch.fit({'training': training_input}, job_name=job_name) diff --git 
a/test-toolkit/integration/sagemaker/timeout.py b/test-toolkit/integration/sagemaker/timeout.py deleted file mode 100644 index 1b941589..00000000 --- a/test-toolkit/integration/sagemaker/timeout.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -# TODO: this is used in all containers and sdk. We should move it to container support or sdk test utils. -from __future__ import absolute_import -import signal -from contextlib import contextmanager -import logging - -from botocore.exceptions import ClientError - -LOGGER = logging.getLogger('timeout') - - -class TimeoutError(Exception): - pass - - -@contextmanager -def timeout(seconds=0, minutes=0, hours=0): - """Add a signal-based timeout to any block of code. - If multiple time units are specified, they will be added together to determine time limit. - Usage: - with timeout(seconds=5): - my_slow_function(...) - Args: - - seconds: The time limit, in seconds. - - minutes: The time limit, in minutes. - - hours: The time limit, in hours. 
- """ - - limit = seconds + 60 * minutes + 3600 * hours - - def handler(signum, frame): - raise TimeoutError('timed out after {} seconds'.format(limit)) - - try: - signal.signal(signal.SIGALRM, handler) - signal.alarm(limit) - - yield - finally: - signal.alarm(0) - - -@contextmanager -def timeout_and_delete_endpoint(estimator, seconds=0, minutes=0, hours=0): - with timeout(seconds=seconds, minutes=minutes, hours=hours) as t: - try: - yield [t] - finally: - try: - estimator.delete_endpoint() - LOGGER.info('deleted endpoint') - except ClientError as ce: - if ce.response['Error']['Code'] == 'ValidationException': - # avoids the inner exception to be overwritten - pass - - -@contextmanager -def timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session, seconds=0, minutes=0, hours=0): - with timeout(seconds=seconds, minutes=minutes, hours=hours) as t: - try: - yield [t] - finally: - try: - sagemaker_session.delete_endpoint(endpoint_name) - LOGGER.info('deleted endpoint {}'.format(endpoint_name)) - except ClientError as ce: - if ce.response['Error']['Code'] == 'ValidationException': - # avoids the inner exception to be overwritten - pass diff --git a/test-toolkit/resources/distributed_operations.py b/test-toolkit/resources/distributed_operations.py deleted file mode 100644 index e26cac2e..00000000 --- a/test-toolkit/resources/distributed_operations.py +++ /dev/null @@ -1,268 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import -import argparse -import json -import logging -import os -import sys -import torch -import torch.distributed as dist -from torch.multiprocessing import Process -import torch.utils.data -import torch.utils.data.distributed - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) -logger.addHandler(logging.StreamHandler(sys.stdout)) - - -def _get_tensor(rank, rows, columns): - device = torch.device( - "cuda:{}".format(dist.get_rank() % torch.cuda.device_count()) if torch.cuda.is_available() - else "cpu" - ) - tensor = torch.ones(rows, columns) * (rank + 1) - return tensor.to(device) - - -def _get_zeros_tensor(rows, columns): - device = torch.device( - "cuda:{}".format(dist.get_rank() % torch.cuda.device_count()) if torch.cuda.is_available() - else "cpu" - ) - tensor = torch.zeros(rows, columns) - return tensor.to(device) - - -def _get_zeros_tensors_list(rows, columns): - return [_get_zeros_tensor(rows, columns) for _ in range(dist.get_world_size())] - - -def _get_tensors_sum(rows, columns): - device = torch.device( - "cuda:{}".format(dist.get_rank() % torch.cuda.device_count()) if torch.cuda.is_available() - else "cpu" - ) - result = (1 + dist.get_world_size()) * dist.get_world_size() / 2 - tensor = torch.ones(rows, columns) * result - return tensor.to(device) - - -def _send_recv(rank, rows, columns): - source = 0 - tensor = _get_tensor(rank, rows, columns) - logger.debug('Rank: {},\nTensor BEFORE send_recv: {}'.format(rank, tensor)) - if rank == 0: - for i in range(1, dist.get_world_size()): - dist.send(tensor=tensor, dst=i) - else: - dist.recv(tensor=tensor, src=source) - logger.debug('Rank: {},\nTensor AFTER send_recv: {}\n'.format(rank, tensor)) - - assert torch.equal(tensor, _get_tensor(source, rows, columns)),\ - 'Rank {}: Tensor was not equal to rank {} tensor after send-recv.'.format(rank, source) - - -def _broadcast(rank, rows, columns): - source = 0 - tensor = _get_tensor(rank, rows, columns) - 
logger.debug('Rank: {},\nTensor BEFORE broadcast: {}'.format(rank, tensor)) - dist.broadcast(tensor, src=source) - logger.debug('Rank: {},\nTensor AFTER broadcast: {}\n'.format(rank, tensor)) - - assert torch.equal(tensor, _get_tensor(source, rows, columns)), \ - 'Rank {}: Tensor was not equal to rank {} tensor after broadcast.'.format(rank, source) - - -def _all_reduce(rank, rows, columns): - tensor = _get_tensor(rank, rows, columns) - logger.debug('Rank: {},\nTensor BEFORE all_reduce: {}'.format(rank, tensor)) - dist.all_reduce(tensor, op=dist.reduce_op.SUM) - logger.debug('Rank: {},\nTensor AFTER all_reduce: {}\n'.format(rank, tensor)) - - assert torch.equal(tensor, _get_tensors_sum(rows, columns)), \ - 'Rank {}: Tensor was not equal to SUM of {} tensors after all_reduce.'.format(rank, dist.get_world_size()) - - -def _reduce(rank, rows, columns): - dest = 0 - tensor = _get_tensor(rank, rows, columns) - logger.debug('Rank: {},\nTensor BEFORE reduce: {}'.format(rank, tensor)) - # this is inplace operation - dist.reduce(tensor, op=dist.reduce_op.SUM, dst=dest) - logger.debug('Rank: {},\nTensor AFTER reduce: {}\n'.format(rank, tensor)) - - if rank == dest: - assert torch.equal(tensor, _get_tensors_sum(rows, columns)), \ - 'Rank {}: Tensor was not equal to SUM of {} tensors after reduce.'.format(rank, dist.get_world_size()) - - -def _all_gather(rank, rows, columns): - tensor = _get_tensor(rank, rows, columns) - tensors_list = _get_zeros_tensors_list(rows, columns) - logger.debug('Rank: {},\nTensor BEFORE all_gather: {}'.format(rank, tensor)) - dist.all_gather(tensors_list, tensor) - logger.debug('Rank: {},\nTensor AFTER all_gather: {}. 
tensors_list: {}\n'.format( - rank, tensor, tensors_list)) - - # tensor shouldn't have changed - assert torch.equal(tensor, _get_tensor(rank, rows, columns)), \ - 'Rank {}: Tensor got changed after all_gather.'.format(rank) - for i in range(dist.get_world_size()): - assert torch.equal(tensors_list[i], _get_tensor(i, rows, columns)), \ - 'Rank {}: tensors lists are not the same after all_gather.' - - -def _gather(rank, rows, columns): - dest = 0 - tensor = _get_tensor(rank, rows, columns) - if rank == dest: - tensors_list = _get_zeros_tensors_list(rows, columns) - logger.debug('Rank: {},\nTensor BEFORE gather: {}. tensors_list: {}'.format( - rank, tensor, tensors_list)) - dist.gather(tensor=tensor, gather_list=tensors_list) - logger.debug('Rank: {},\nTensor AFTER gather: {}. tensors_list: {}\n'.format( - rank, tensor, tensors_list)) - for i in range(dist.get_world_size()): - assert torch.equal(tensors_list[i], _get_tensor(i, rows, columns)), \ - 'Rank {}: tensors lists are not the same after gather.' - else: - logger.debug('Rank: {},\nTensor BEFORE gather: {}\n'.format(rank, tensor)) - dist.gather(tensor=tensor, dst=dest) - logger.debug('Rank: {},\nTensor AFTER gather: {}\n'.format(rank, tensor)) - - # tensor shouldn't have changed - assert torch.equal(tensor, _get_tensor(rank, rows, columns)), \ - 'Rank {}: Tensor got changed after gather.'.format(rank) - - -def _scatter(rank, rows, columns): - source = 0 - tensor = _get_tensor(rank, rows, columns) - if rank == source: - tensors_list = _get_zeros_tensors_list(rows, columns) - logger.debug('Rank: {},\nTensor BEFORE scatter: {}. 
tensors_list: {}'.format( - rank, tensor, tensors_list)) - dist.scatter(tensor=tensor, scatter_list=tensors_list) - else: - logger.debug('Rank: {},\nTensor BEFORE scatter: {}\n'.format(rank, tensor)) - dist.scatter(tensor=tensor, src=source) - logger.debug('Rank: {},\nTensor AFTER scatter: {}\n'.format(rank, tensor)) - - assert torch.equal(tensor, _get_zeros_tensor(rows, columns)), \ - 'Rank {}: Tensor should be all zeroes after scatter.'.format(rank) - - -def _barrier(rank): - logger.debug('Rank: {}, Waiting for other processes before the barrier.'.format(rank)) - dist.barrier() - logger.debug('Rank: {}, Passing the barrier'.format(rank)) - - -def main(): - print('Starting') - parser = argparse.ArgumentParser() - # Configurable hyperparameters - parser.add_argument('--rows', type=int, default=1, - help='Number of rows in the tensor.') - parser.add_argument('--columns', type=int, default=1, - help='Number of columns in the tensor.') - parser.add_argument('--backend', type=str, default=None, - help='backend for distributed operations.') - - # Container environment - parser.add_argument('--hosts', type=list, default=json.loads(os.environ["SM_HOSTS"])) - parser.add_argument('--current-host', type=str, default=os.environ["SM_CURRENT_HOST"]) - parser.add_argument('--model-dir', type=str, default=os.environ["SM_MODEL_DIR"]) - parser.add_argument('--num-gpus', type=int, default=os.environ["SM_NUM_GPUS"]) - parser.add_argument('--num-cpus', type=int, default=os.environ["SM_NUM_CPUS"]) - - args = parser.parse_args() - - number_of_processes = args.num_gpus if args.num_gpus > 0 else args.num_cpus - world_size = number_of_processes * len(args.hosts) - logger.info('Running \'{}\' backend on {} nodes and {} processes. 
World size is {}.'.format( - args.backend, len(args.hosts), number_of_processes, world_size - )) - host_rank = args.hosts.index(args.current_host) - master_addr = args.hosts[0] - master_port = '55555' - processes = [] - for rank in range(number_of_processes): - process_rank = host_rank * number_of_processes + rank - p = Process( - target=init_processes, - args=(args.backend, - master_addr, - master_port, - process_rank, - world_size, - args.rows, - args.columns, - args.current_host, - args.num_gpus) - ) - p.start() - processes.append(p) - - for p in processes: - p.join() - - save('success', args.model_dir) - - -def init_processes(backend, master_addr, master_port, rank, world_size, - rows, columns, host, num_gpus): - # Initialize the distributed environment. - os.environ['WORLD_SIZE'] = str(world_size) - os.environ['RANK'] = str(rank) - os.environ['MASTER_ADDR'] = master_addr - os.environ['MASTER_PORT'] = master_port - - logger.info('Init process rank {} on host \'{}\''.format(rank, host)) - dist.init_process_group(backend=backend, rank=rank, world_size=world_size) - run(backend, rank, rows, columns, num_gpus) - - -def run(backend, rank, rows, columns, num_gpus): - # https://pytorch.org/docs/master/distributed.html - if backend == 'gloo': - print('Run operations supported by \'gloo\' backend.') - _broadcast(rank, rows, columns) - _all_reduce(rank, rows, columns) - _barrier(rank) - - # this operation supported only on cpu - if num_gpus == 0: - _send_recv(rank, rows, columns) - elif backend == 'nccl': - print('Run operations supported by \'nccl\' backend.') - # Note: nccl does not support gather or scatter as well: - # https://github.com/pytorch/pytorch/blob/v0.4.0/torch/lib/THD/base/data_channels/DataChannelNccl.cpp - _broadcast(rank, rows, columns) - _all_reduce(rank, rows, columns) - _reduce(rank, rows, columns) - _all_gather(rank, rows, columns) - - -def save(result, model_dir): - filename = os.path.join(model_dir, result) - if not 
os.path.exists(filename): - logger.info("Saving success result") - with open(filename, 'w') as f: - f.write(result) - - -if __name__ == '__main__': - main() diff --git a/test-toolkit/resources/mnist/data/training/MNIST/processed/test.pt b/test-toolkit/resources/mnist/data/training/MNIST/processed/test.pt deleted file mode 100644 index 1d2112e0..00000000 Binary files a/test-toolkit/resources/mnist/data/training/MNIST/processed/test.pt and /dev/null differ diff --git a/test-toolkit/resources/mnist/data/training/MNIST/processed/training.pt b/test-toolkit/resources/mnist/data/training/MNIST/processed/training.pt deleted file mode 100644 index 7583094a..00000000 Binary files a/test-toolkit/resources/mnist/data/training/MNIST/processed/training.pt and /dev/null differ diff --git a/test-toolkit/resources/mnist/data/training/model b/test-toolkit/resources/mnist/data/training/model deleted file mode 100644 index 393ea140..00000000 Binary files a/test-toolkit/resources/mnist/data/training/model and /dev/null differ diff --git a/test-toolkit/resources/mnist/mnist.py b/test-toolkit/resources/mnist/mnist.py deleted file mode 100644 index 1359f712..00000000 --- a/test-toolkit/resources/mnist/mnist.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import - -import argparse -import logging -import os -import sys - -from sagemaker_training import environment -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.utils.data -import torch.utils.data.distributed -from torchvision import datasets, transforms - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) -logger.addHandler(logging.StreamHandler(sys.stdout)) - - -# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py -class Net(nn.Module): - def __init__(self): - logger.info("Create neural network module") - - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=1) - - -def _get_train_data_loader(batch_size, training_dir, is_distributed, **kwargs): - logger.info("Get train data loader") - dataset = datasets.MNIST(training_dir, train=True, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) if is_distributed else None - return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=train_sampler is None, - sampler=train_sampler, **kwargs) - - -def _get_test_data_loader(test_batch_size, training_dir, **kwargs): - logger.info("Get test data loader") - return torch.utils.data.DataLoader( - datasets.MNIST(training_dir, train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), 
(0.3081,)) - ])), - batch_size=test_batch_size, shuffle=True, **kwargs) - - -def _average_gradients(model): - # Gradient averaging. - size = float(dist.get_world_size()) - for param in model.parameters(): - dist.all_reduce(param.grad.data, op=dist.reduce_op.SUM) - param.grad.data /= size - - -def train(args): - is_distributed = len(args.hosts) > 1 and args.backend is not None - logger.debug("Distributed training - {}".format(is_distributed)) - use_cuda = (args.processor == 'gpu') or (args.num_gpus > 0) - logger.debug("Number of gpus available - {}".format(args.num_gpus)) - kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} - device = torch.device("cuda" if use_cuda else "cpu") - - if is_distributed: - # Initialize the distributed environment. - world_size = len(args.hosts) - os.environ['WORLD_SIZE'] = str(world_size) - host_rank = args.hosts.index(args.current_host) - os.environ['RANK'] = str(host_rank) - dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size) - logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format( - args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format( - dist.get_rank(), args.num_gpus)) - - # set the seed for generating random numbers - torch.manual_seed(args.seed) - if use_cuda: - torch.cuda.manual_seed(args.seed) - - train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs) - test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs) - - # TODO: assert the logs when we move to the SDK local mode - logger.debug("Processes {}/{} ({:.0f}%) of train data".format( - len(train_loader.sampler), len(train_loader.dataset), - 100. * len(train_loader.sampler) / len(train_loader.dataset) - )) - - logger.debug("Processes {}/{} ({:.0f}%) of test data".format( - len(test_loader.sampler), len(test_loader.dataset), - 100. 
* len(test_loader.sampler) / len(test_loader.dataset) - )) - - model = Net().to(device) - if is_distributed and use_cuda: - # multi-machine multi-gpu case - logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.") - model = torch.nn.parallel.DistributedDataParallel(model) - elif use_cuda: - # single-machine multi-gpu case - logger.debug("Single-machine multi-gpu: using DataParallel().cuda().") - model = torch.nn.DataParallel(model).to(device) - else: - # single-machine or multi-machine cpu case - logger.debug("Single-machine/multi-machine cpu: using DataParallel.") - model = torch.nn.DataParallel(model) - - optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) - - for epoch in range(1, args.epochs + 1): - model.train() - for batch_idx, (data, target) in enumerate(train_loader, 1): - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - if is_distributed and not use_cuda: - # average gradients manually for multi-machine cpu case only - _average_gradients(model) - optimizer.step() - if batch_idx % args.log_interval == 0: - logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.sampler), - 100. 
* batch_idx / len(train_loader), loss.item())) - test(model, test_loader, device) - save_model(model, args.model_dir) - - -def test(model, test_loader, device): - model.eval() - test_loss = 0 - correct = 0 - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += F.nll_loss(output, target, size_average=None).item() # sum up batch loss - pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - - test_loss /= len(test_loader.dataset) - logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) - - -def save_model(model, model_dir): - logger.info("Saving the model.") - path = os.path.join(model_dir, 'model.pth') - # recommended way from http://pytorch.org/docs/master/notes/serialization.html - torch.save(model.state_dict(), path) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - - # Data and model checkpoints directories - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=1, metavar='N', - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=100, metavar='N', - help='how many batches to wait before logging training status') - 
parser.add_argument('--backend', type=str, default=None, - help='backend for distributed training') - parser.add_argument('--processor', type=str, default='cpu', - help='backend for distributed training') - - # Container environment - env = environment.Environment() - parser.add_argument('--hosts', type=list, default=env.hosts) - parser.add_argument('--current-host', type=str, default=env.current_host) - parser.add_argument('--model-dir', type=str, default=env.model_dir) - parser.add_argument('--data-dir', type=str, default=env.channel_input_dirs['training']) - parser.add_argument('--num-gpus', type=int, default=env.num_gpus) - - train(parser.parse_args()) diff --git a/test-toolkit/utils/__init__.py b/test-toolkit/utils/__init__.py deleted file mode 100644 index 79cb9cdf..00000000 --- a/test-toolkit/utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import diff --git a/test-toolkit/utils/local_mode_utils.py b/test-toolkit/utils/local_mode_utils.py deleted file mode 100644 index acd1197e..00000000 --- a/test-toolkit/utils/local_mode_utils.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. 
A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -from contextlib import contextmanager -import fcntl -import os -import tarfile -import time - -from integration import resources_path - -LOCK_PATH = os.path.join(resources_path, 'local_mode_lock') - - -@contextmanager -def lock(): - # Since Local Mode uses the same port for serving, we need a lock in order - # to allow concurrent test execution. - local_mode_lock_fd = open(LOCK_PATH, 'w') - local_mode_lock = local_mode_lock_fd.fileno() - - fcntl.lockf(local_mode_lock, fcntl.LOCK_EX) - - try: - yield - finally: - time.sleep(5) - fcntl.lockf(local_mode_lock, fcntl.LOCK_UN) - - -def assert_files_exist(output_path, directory_file_map): - for directory, files in directory_file_map.items(): - with tarfile.open(os.path.join(output_path, '{}.tar.gz'.format(directory))) as tar: - for f in files: - tar.getmember(f) diff --git a/test/__init__.py b/test/__init__.py deleted file mode 100644 index 79cb9cdf..00000000 --- a/test/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
-from __future__ import absolute_import diff --git a/test/conftest.py b/test/conftest.py index 5609d0fb..13e28de3 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -22,7 +22,8 @@ import tempfile from sagemaker import LocalSession, Session -from sagemaker.pytorch import PyTorch + +from utils import image_utils logger = logging.getLogger(__name__) logging.getLogger('boto').setLevel(logging.INFO) @@ -42,17 +43,33 @@ def pytest_addoption(parser): - parser.addoption('--aws-id') + parser.addoption('--build-image', '-B', action='store_true') + parser.addoption('--push-image', '-P', action='store_true') + parser.addoption('--dockerfile-type', '-T', choices=['dlc.cpu', 'dlc.gpu', 'pytorch'], + default='pytorch') + parser.addoption('--dockerfile', '-D', default=None) + parser.addoption('--aws-id', default=None) parser.addoption('--instance-type') - parser.addoption('--docker-base-name', default='pytorch') + parser.addoption('--docker-base-name', default='sagemaker-pytorch-training') parser.addoption('--region', default='us-west-2') - parser.addoption('--framework-version', default=PyTorch.LATEST_VERSION) + parser.addoption('--framework-version', default="1.4.0") parser.addoption('--py-version', choices=['2', '3'], default=str(sys.version_info.major)) parser.addoption('--processor', choices=['gpu', 'cpu'], default='cpu') # If not specified, will default to {framework-version}-{processor}-py{py-version} parser.addoption('--tag', default=None) +@pytest.fixture(scope='session', name='dockerfile_type') +def fixture_dockerfile_type(request): + return request.config.getoption('--dockerfile-type') + + +@pytest.fixture(scope='session', name='dockerfile') +def fixture_dockerfile(request, dockerfile_type): + dockerfile = request.config.getoption('--dockerfile') + return dockerfile if dockerfile else 'Dockerfile.{}'.format(dockerfile_type) + + @pytest.fixture(scope='session', name='docker_base_name') def fixture_docker_base_name(request): return 
request.config.getoption('--docker-base-name') @@ -85,11 +102,6 @@ def fixture_tag(request, framework_version, processor, py_version): return provided_tag if provided_tag else default_tag -@pytest.fixture(scope='session', name='docker_image') -def fixture_docker_image(docker_base_name, tag): - return '{}:{}'.format(docker_base_name, tag) - - @pytest.fixture def opt_ml(): tmp = tempfile.mkdtemp() @@ -108,6 +120,27 @@ def fixture_use_gpu(processor): return processor == 'gpu' +@pytest.fixture(scope='session', name='build_image', autouse=True) +def fixture_build_image(request, framework_version, dockerfile, image_uri, region): + build_image = request.config.getoption('--build-image') + if build_image: + return image_utils.build_image(framework_version=framework_version, + dockerfile=dockerfile, + image_uri=image_uri, + region=region, + cwd=os.path.join(dir_path, '..')) + + return image_uri + + +@pytest.fixture(scope='session', name='push_image', autouse=True) +def fixture_push_image(request, image_uri, region, aws_id): + push_image = request.config.getoption('--push-image') + if push_image: + return image_utils.push_image(image_uri, region, aws_id) + return None + + @pytest.fixture(scope='session', name='sagemaker_session') def fixture_sagemaker_session(region): return Session(boto_session=boto3.Session(region_name=region)) @@ -132,12 +165,14 @@ def fixture_instance_type(request, processor): @pytest.fixture(name='docker_registry', scope='session') def fixture_docker_registry(aws_id, region): - return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region) + return '{}.dkr.ecr.{}.amazonaws.com'.format(aws_id, region) if aws_id else None -@pytest.fixture(name='ecr_image', scope='session') -def fixture_ecr_image(docker_registry, docker_base_name, tag): - return '{}/{}:{}'.format(docker_registry, docker_base_name, tag) +@pytest.fixture(name='image_uri', scope='session') +def fixture_image_uri(docker_registry, docker_base_name, tag): + if docker_registry: + return 
'{}/{}:{}'.format(docker_registry, docker_base_name, tag) + return '{}:{}'.format(docker_base_name, tag) @pytest.fixture(scope='session', name='dist_cpu_backend', params=['gloo']) diff --git a/test-toolkit/docker/1.4.0/Dockerfile.dlc.cpu b/test/container/1.4.0/Dockerfile.dlc.cpu similarity index 100% rename from test-toolkit/docker/1.4.0/Dockerfile.dlc.cpu rename to test/container/1.4.0/Dockerfile.dlc.cpu diff --git a/test-toolkit/docker/1.4.0/Dockerfile.dlc.gpu b/test/container/1.4.0/Dockerfile.dlc.gpu similarity index 100% rename from test-toolkit/docker/1.4.0/Dockerfile.dlc.gpu rename to test/container/1.4.0/Dockerfile.dlc.gpu diff --git a/test-toolkit/docker/1.4.0/Dockerfile.pytorch b/test/container/1.4.0/Dockerfile.pytorch similarity index 100% rename from test-toolkit/docker/1.4.0/Dockerfile.pytorch rename to test/container/1.4.0/Dockerfile.pytorch diff --git a/test/integration/__init__.py b/test/integration/__init__.py index b428aaeb..e879ae0e 100644 --- a/test/integration/__init__.py +++ b/test/integration/__init__.py @@ -32,6 +32,9 @@ model_gpu_1d_dir = os.path.join(model_gpu_dir, '1d') call_model_fn_once_script = os.path.join(resources_path, 'call_model_fn_once.py') +requirements_dir = os.path.join(resources_path, 'requirements') +requirements_script = 'entry.py' + ROLE = 'dummy/unused-role' DEFAULT_TIMEOUT = 20 PYTHON3 = 'py3' diff --git a/test/integration/local/test_distributed_training.py b/test/integration/local/test_distributed_training.py index b2b2504f..553110a8 100644 --- a/test/integration/local/test_distributed_training.py +++ b/test/integration/local/test_distributed_training.py @@ -17,8 +17,8 @@ import pytest from sagemaker.pytorch import PyTorch -from test.integration import data_dir, dist_operations_path, mnist_script, ROLE -from test.utils.local_mode_utils import assert_files_exist +from integration import data_dir, dist_operations_path, mnist_script, ROLE +from utils.local_mode_utils import assert_files_exist MODEL_SUCCESS_FILES = { 
'model': ['success'], @@ -32,10 +32,10 @@ def fixture_dist_gpu_backend(request): @pytest.mark.skip_gpu -def test_dist_operations_path_cpu(docker_image, dist_cpu_backend, sagemaker_local_session, tmpdir): +def test_dist_operations_path_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir): estimator = PyTorch(entry_point=dist_operations_path, role=ROLE, - image_name=docker_image, + image_name=image_uri, train_instance_count=2, train_instance_type='local', sagemaker_session=sagemaker_local_session, @@ -46,10 +46,10 @@ def test_dist_operations_path_cpu(docker_image, dist_cpu_backend, sagemaker_loca @pytest.mark.skip_cpu -def test_dist_operations_path_gpu_nccl(docker_image, sagemaker_local_session, tmpdir): +def test_dist_operations_path_gpu_nccl(image_uri, sagemaker_local_session, tmpdir): estimator = PyTorch(entry_point=dist_operations_path, role=ROLE, - image_name=docker_image, + image_name=image_uri, train_instance_count=1, train_instance_type='local_gpu', sagemaker_session=sagemaker_local_session, @@ -60,10 +60,10 @@ def test_dist_operations_path_gpu_nccl(docker_image, sagemaker_local_session, tm @pytest.mark.skip_gpu -def test_cpu_nccl(docker_image, sagemaker_local_session, tmpdir): +def test_cpu_nccl(image_uri, sagemaker_local_session, tmpdir): estimator = PyTorch(entry_point=mnist_script, role=ROLE, - image_name=docker_image, + image_name=image_uri, train_instance_count=2, train_instance_type='local', sagemaker_session=sagemaker_local_session, @@ -78,10 +78,10 @@ def test_cpu_nccl(docker_image, sagemaker_local_session, tmpdir): @pytest.mark.skip_gpu -def test_mnist_cpu(docker_image, dist_cpu_backend, sagemaker_local_session, tmpdir): +def test_mnist_cpu(image_uri, dist_cpu_backend, sagemaker_local_session, tmpdir): estimator = PyTorch(entry_point=mnist_script, role=ROLE, - image_name=docker_image, + image_name=image_uri, train_instance_count=2, train_instance_type='local', sagemaker_session=sagemaker_local_session, diff --git 
a/test-toolkit/integration/local/test_requirements.py b/test/integration/local/test_requirements.py similarity index 100% rename from test-toolkit/integration/local/test_requirements.py rename to test/integration/local/test_requirements.py diff --git a/test/integration/local/test_single_machine_training.py b/test/integration/local/test_single_machine_training.py index a4852e23..68f99859 100644 --- a/test/integration/local/test_single_machine_training.py +++ b/test/integration/local/test_single_machine_training.py @@ -14,17 +14,16 @@ import os -import pytest from sagemaker.pytorch import PyTorch -from test.utils.local_mode_utils import assert_files_exist -from test.integration import data_dir, fastai_path, fastai_mnist_script, mnist_script, PYTHON3, ROLE +from utils.local_mode_utils import assert_files_exist +from integration import data_dir, mnist_script, ROLE -def test_mnist(docker_image, processor, instance_type, sagemaker_local_session, tmpdir): +def test_mnist(image_uri, processor, instance_type, sagemaker_local_session, tmpdir): estimator = PyTorch(entry_point=mnist_script, role=ROLE, - image_name=docker_image, + image_name=image_uri, train_instance_count=1, train_instance_type=instance_type, sagemaker_session=sagemaker_local_session, @@ -34,22 +33,6 @@ def test_mnist(docker_image, processor, instance_type, sagemaker_local_session, _train_and_assert_success(estimator, data_dir, str(tmpdir)) -def test_fastai_mnist(docker_image, instance_type, py_version, sagemaker_local_session, tmpdir): - if py_version != PYTHON3: - pytest.skip('Skipping the test because fastai supports >= Python 3.6.') - - estimator = PyTorch(entry_point=fastai_mnist_script, - role=ROLE, - image_name=docker_image, - train_instance_count=1, - train_instance_type=instance_type, - sagemaker_session=sagemaker_local_session, - output_path='file://{}'.format(tmpdir)) - - input_dir = os.path.join(fastai_path, 'mnist_tiny') - _train_and_assert_success(estimator, input_dir, str(tmpdir)) - - def 
_train_and_assert_success(estimator, input_dir, output_path): estimator.fit({'training': 'file://{}'.format(os.path.join(input_dir, 'training'))}) diff --git a/test/integration/sagemaker/test_dgl.py b/test/integration/sagemaker/test_dgl.py deleted file mode 100644 index f0eb06e4..00000000 --- a/test/integration/sagemaker/test_dgl.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# or in the "license" file accompanying this file. This file is distributed -# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language governing -# permissions and limitations under the License. -from __future__ import absolute_import - -import os - -import pytest -from sagemaker import utils -from sagemaker.pytorch import PyTorch - -from test.integration import resources_path, DEFAULT_TIMEOUT -from test.integration.sagemaker.timeout import timeout - -DGL_DATA_PATH = os.path.join(resources_path, 'dgl-gcn') -DGL_SCRIPT_PATH = os.path.join(DGL_DATA_PATH, 'gcn.py') - - -@pytest.mark.skip_gpu -@pytest.mark.skip_py2_containers -def test_dgl_gcn_training_cpu(sagemaker_session, ecr_image, instance_type): - instance_type = instance_type or 'ml.c4.xlarge' - _test_dgl_training(sagemaker_session, ecr_image, instance_type) - - -@pytest.mark.skip_cpu -@pytest.mark.skip_py2_containers -def test_dgl_gcn_training_gpu(sagemaker_session, ecr_image, instance_type): - instance_type = instance_type or 'ml.p2.xlarge' - _test_dgl_training(sagemaker_session, ecr_image, instance_type) - - -def _test_dgl_training(sagemaker_session, ecr_image, instance_type): - dgl = PyTorch(entry_point=DGL_SCRIPT_PATH, - role='SageMakerRole', - 
train_instance_count=1, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=ecr_image) - with timeout(minutes=DEFAULT_TIMEOUT): - job_name = utils.unique_name_from_base('test-pytorch-dgl-image') - dgl.fit(job_name=job_name) diff --git a/test/integration/sagemaker/test_distributed_operations.py b/test/integration/sagemaker/test_distributed_operations.py index 3f05350d..6e072598 100644 --- a/test/integration/sagemaker/test_distributed_operations.py +++ b/test/integration/sagemaker/test_distributed_operations.py @@ -20,9 +20,8 @@ from sagemaker.pytorch import PyTorch from six.moves.urllib.parse import urlparse -from test.integration import (data_dir, dist_operations_path, fastai_path, mnist_script, - DEFAULT_TIMEOUT) -from test.integration.sagemaker.timeout import timeout +from integration import data_dir, dist_operations_path, mnist_script, DEFAULT_TIMEOUT +from integration.sagemaker.timeout import timeout MULTI_GPU_INSTANCE = 'ml.p3.8xlarge' @@ -30,75 +29,50 @@ @pytest.mark.skip_gpu @pytest.mark.deploy_test @pytest.mark.skip_test_in_region -def test_dist_operations_cpu(sagemaker_session, ecr_image, instance_type, dist_cpu_backend): +def test_dist_operations_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend): instance_type = instance_type or 'ml.c4.xlarge' - _test_dist_operations(sagemaker_session, ecr_image, instance_type, dist_cpu_backend) + _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_cpu_backend) @pytest.mark.skip_cpu @pytest.mark.deploy_test -def test_dist_operations_gpu(sagemaker_session, instance_type, ecr_image, dist_gpu_backend): +def test_dist_operations_gpu(sagemaker_session, instance_type, image_uri, dist_gpu_backend): instance_type = instance_type or 'ml.p2.xlarge' - _test_dist_operations(sagemaker_session, ecr_image, instance_type, dist_gpu_backend) + _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_gpu_backend) @pytest.mark.skip_cpu -def 
test_dist_operations_multi_gpu(sagemaker_session, ecr_image, dist_gpu_backend): - _test_dist_operations(sagemaker_session, ecr_image, MULTI_GPU_INSTANCE, dist_gpu_backend, 1) +def test_dist_operations_multi_gpu(sagemaker_session, image_uri, dist_gpu_backend): + _test_dist_operations(sagemaker_session, image_uri, MULTI_GPU_INSTANCE, dist_gpu_backend, 1) @pytest.mark.skip_cpu @pytest.mark.skip_py2_containers -def test_dist_operations_fastai_gpu(sagemaker_session, ecr_image): - with timeout(minutes=DEFAULT_TIMEOUT): - pytorch = PyTorch(entry_point='train_cifar.py', - source_dir=os.path.join(fastai_path, 'cifar'), - role='SageMakerRole', - train_instance_count=1, - train_instance_type=MULTI_GPU_INSTANCE, - sagemaker_session=sagemaker_session, - image_name=ecr_image) - - pytorch.sagemaker_session.default_bucket() - training_input = pytorch.sagemaker_session.upload_data( - path=os.path.join(fastai_path, 'cifar_tiny', 'training'), - key_prefix='pytorch/distributed_operations' - ) - - job_name = utils.unique_name_from_base('test-pytorch-dist-ops') - pytorch.fit({'training': training_input}, job_name=job_name) - - model_s3_url = pytorch.create_model().model_data - _assert_s3_file_exists(sagemaker_session.boto_region_name, model_s3_url) - - -@pytest.mark.skip_cpu -@pytest.mark.skip_py2_containers -def test_mnist_gpu(sagemaker_session, ecr_image, dist_gpu_backend): +def test_mnist_gpu(sagemaker_session, image_uri, dist_gpu_backend): with timeout(minutes=DEFAULT_TIMEOUT): pytorch = PyTorch(entry_point=mnist_script, role='SageMakerRole', train_instance_count=2, - image_name=ecr_image, + image_name=image_uri, train_instance_type=MULTI_GPU_INSTANCE, sagemaker_session=sagemaker_session, hyperparameters={'backend': dist_gpu_backend}) training_input = sagemaker_session.upload_data(path=os.path.join(data_dir, 'training'), key_prefix='pytorch/mnist') - job_name = utils.unique_name_from_base('test-pytorch-dist-ops') + job_name = utils.unique_name_from_base('test-pytorch-dist-ops') 
pytorch.fit({'training': training_input}, job_name=job_name) -def _test_dist_operations(sagemaker_session, ecr_image, instance_type, dist_backend, train_instance_count=3): +def _test_dist_operations(sagemaker_session, image_uri, instance_type, dist_backend, train_instance_count=3): with timeout(minutes=DEFAULT_TIMEOUT): pytorch = PyTorch(entry_point=dist_operations_path, role='SageMakerRole', train_instance_count=train_instance_count, train_instance_type=instance_type, sagemaker_session=sagemaker_session, - image_name=ecr_image, + image_name=image_uri, hyperparameters={'backend': dist_backend}) pytorch.sagemaker_session.default_bucket() diff --git a/test/integration/sagemaker/test_experiments.py b/test/integration/sagemaker/test_experiments.py deleted file mode 100644 index 33fd63ec..00000000 --- a/test/integration/sagemaker/test_experiments.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# or in the "license" file accompanying this file. This file is distributed -# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language governing -# permissions and limitations under the License. 
-from __future__ import absolute_import - -import time - -import pytest -from sagemaker.pytorch import PyTorch -from sagemaker import utils -from test.integration import training_dir, smdebug_mnist_script, DEFAULT_TIMEOUT -from test.integration.sagemaker.timeout import timeout - - -@pytest.mark.skip_py2_containers -def test_training(sagemaker_session, ecr_image, instance_type): - - from smexperiments.experiment import Experiment - from smexperiments.trial import Trial - from smexperiments.trial_component import TrialComponent - - sm_client = sagemaker_session.sagemaker_client - - experiment_name = "pytorch-container-integ-test-{}".format(int(time.time())) - - experiment = Experiment.create( - experiment_name=experiment_name, - description="Integration test full customer e2e from sagemaker-pytorch-container", - sagemaker_boto_client=sm_client, - ) - - trial_name = "pytorch-container-integ-test-{}".format(int(time.time())) - trial = Trial.create( - experiment_name=experiment_name, trial_name=trial_name, sagemaker_boto_client=sm_client - ) - - hyperparameters = { - "random_seed": True, - "num_steps": 50, - "smdebug_path": "/opt/ml/output/tensors", - "epochs": 1, - "data_dir": training_dir, - } - - training_job_name = utils.unique_name_from_base("test-pytorch-experiments-image") - - # create a training job and wait for it to complete - with timeout(minutes=DEFAULT_TIMEOUT): - pytorch = PyTorch( - entry_point=smdebug_mnist_script, - role="SageMakerRole", - train_instance_count=1, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=ecr_image, - hyperparameters=hyperparameters, - ) - training_input = pytorch.sagemaker_session.upload_data( - path=training_dir, key_prefix="pytorch/mnist" - ) - pytorch.fit({"training": training_input}, job_name=training_job_name) - - training_job = sm_client.describe_training_job(TrainingJobName=training_job_name) - training_job_arn = training_job["TrainingJobArn"] - - # verify trial component auto 
created from the training job - trial_components = list( - TrialComponent.list(source_arn=training_job_arn, sagemaker_boto_client=sm_client) - ) - - trial_component_summary = trial_components[0] - trial_component = TrialComponent.load( - trial_component_name=trial_component_summary.trial_component_name, - sagemaker_boto_client=sm_client, - ) - - # associate the trial component with the trial - trial.add_trial_component(trial_component) - - # cleanup - trial.remove_trial_component(trial_component_summary.trial_component_name) - trial_component.delete() - trial.delete() - experiment.delete() diff --git a/test/integration/sagemaker/test_mnist.py b/test/integration/sagemaker/test_mnist.py index 33b5d814..0a7fec96 100644 --- a/test/integration/sagemaker/test_mnist.py +++ b/test/integration/sagemaker/test_mnist.py @@ -16,34 +16,34 @@ from sagemaker import utils from sagemaker.pytorch import PyTorch -from test.integration import training_dir, mnist_script, DEFAULT_TIMEOUT -from test.integration.sagemaker.timeout import timeout +from integration import training_dir, mnist_script, DEFAULT_TIMEOUT +from integration.sagemaker.timeout import timeout @pytest.mark.skip_gpu -def test_mnist_distributed_cpu(sagemaker_session, ecr_image, instance_type, dist_cpu_backend): +def test_mnist_distributed_cpu(sagemaker_session, image_uri, instance_type, dist_cpu_backend): instance_type = instance_type or 'ml.c4.xlarge' - _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_cpu_backend) + _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_cpu_backend) @pytest.mark.skip_cpu -def test_mnist_distributed_gpu(sagemaker_session, ecr_image, instance_type, dist_gpu_backend): +def test_mnist_distributed_gpu(sagemaker_session, image_uri, instance_type, dist_gpu_backend): instance_type = instance_type or 'ml.p2.xlarge' - _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_gpu_backend) + _test_mnist_distributed(sagemaker_session, 
image_uri, instance_type, dist_gpu_backend) -def _test_mnist_distributed(sagemaker_session, ecr_image, instance_type, dist_backend): +def _test_mnist_distributed(sagemaker_session, image_uri, instance_type, dist_backend): with timeout(minutes=DEFAULT_TIMEOUT): pytorch = PyTorch(entry_point=mnist_script, role='SageMakerRole', train_instance_count=2, train_instance_type=instance_type, sagemaker_session=sagemaker_session, - image_name=ecr_image, + image_name=image_uri, hyperparameters={'backend': dist_backend, 'epochs': 2}) - training_input = pytorch.sagemaker_session.upload_data(path=training_dir, key_prefix='pytorch/mnist') + job_name = utils.unique_name_from_base('test-pytorch-mnist') pytorch.fit({'training': training_input}, job_name=job_name) diff --git a/test/integration/sagemaker/test_training_smdebug.py b/test/integration/sagemaker/test_training_smdebug.py deleted file mode 100644 index 604e7cec..00000000 --- a/test/integration/sagemaker/test_training_smdebug.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). -# You may not use this file except in compliance with the License. -# A copy of the License is located at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# or in the "license" file accompanying this file. This file is distributed -# on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either -# express or implied. See the License for the specific language governing -# permissions and limitations under the License. 
-from __future__ import absolute_import - -import pytest -from sagemaker import utils -from sagemaker.pytorch import PyTorch - -from test.integration import training_dir, smdebug_mnist_script, DEFAULT_TIMEOUT -from test.integration.sagemaker.timeout import timeout - - -@pytest.mark.skip_py2_containers -def test_training_smdebug(sagemaker_session, ecr_image, instance_type): - hyperparameters = {'random_seed': True, 'num_steps': 50, 'smdebug_path': '/opt/ml/output/tensors', 'epochs': 1, - 'data_dir': training_dir} - - with timeout(minutes=DEFAULT_TIMEOUT): - pytorch = PyTorch(entry_point=smdebug_mnist_script, - role='SageMakerRole', - train_instance_count=1, - train_instance_type=instance_type, - sagemaker_session=sagemaker_session, - image_name=ecr_image, - hyperparameters=hyperparameters) - - training_input = pytorch.sagemaker_session.upload_data(path=training_dir, - key_prefix='pytorch/mnist') - job_name = utils.unique_name_from_base('test-pytorch-smdebug') - - pytorch.fit({'training': training_input}, job_name=job_name) diff --git a/test/resources/dgl-gcn/gcn.py b/test/resources/dgl-gcn/gcn.py deleted file mode 100644 index a0330d31..00000000 --- a/test/resources/dgl-gcn/gcn.py +++ /dev/null @@ -1,197 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -"""GCN using DGL nn package - -References: -- Semi-Supervised Classification with Graph Convolutional Networks -- Paper: https://arxiv.org/abs/1609.02907 -- Code: https://github.com/tkipf/gcn -""" -import os -import argparse, time -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F - -from dgl import DGLGraph -from dgl.nn.pytorch import GraphConv -from dgl.data import register_data_args, load_data - -class GCN(nn.Module): - def __init__(self, - g, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout): - super(GCN, self).__init__() - self.g = g - self.layers = nn.ModuleList() - # input layer - self.layers.append(GraphConv(in_feats, n_hidden, 
activation=activation)) - # hidden layers - for i in range(n_layers - 1): - self.layers.append(GraphConv(n_hidden, n_hidden, activation=activation)) - # output layer - self.layers.append(GraphConv(n_hidden, n_classes)) - self.dropout = nn.Dropout(p=dropout) - - def forward(self, features): - h = features - for i, layer in enumerate(self.layers): - if i != 0: - h = self.dropout(h) - h = layer(self.g, h) - return h - -def evaluate(model, features, labels, mask): - model.eval() - with torch.no_grad(): - logits = model(features) - logits = logits[mask] - labels = labels[mask] - _, indices = torch.max(logits, dim=1) - correct = torch.sum(indices == labels) - return correct.item() * 1.0 / len(labels) - -def main(args): - # load and preprocess dataset - data = load_data(args) - features = torch.FloatTensor(data.features) - labels = torch.LongTensor(data.labels) - if hasattr(torch, 'BoolTensor'): - train_mask = torch.BoolTensor(data.train_mask) - val_mask = torch.BoolTensor(data.val_mask) - test_mask = torch.BoolTensor(data.test_mask) - else: - train_mask = torch.ByteTensor(data.train_mask) - val_mask = torch.ByteTensor(data.val_mask) - test_mask = torch.ByteTensor(data.test_mask) - in_feats = features.shape[1] - n_classes = data.num_labels - n_edges = data.graph.number_of_edges() - print("""----Data statistics------' - #Edges %d - #Classes %d - #Train samples %d - #Val samples %d - #Test samples %d""" % - (n_edges, n_classes, - train_mask.sum().item(), - val_mask.sum().item(), - test_mask.sum().item())) - - if args.gpu < 0: - cuda = False - else: - cuda = True - torch.cuda.set_device(args.gpu) - features = features.cuda() - labels = labels.cuda() - train_mask = train_mask.cuda() - val_mask = val_mask.cuda() - test_mask = test_mask.cuda() - - # graph preprocess and calculate normalization factor - g = data.graph - # add self loop - if args.self_loop: - g.remove_edges_from(g.selfloop_edges()) - g.add_edges_from(zip(g.nodes(), g.nodes())) - g = DGLGraph(g) - n_edges = 
g.number_of_edges() - # normalization - degs = g.in_degrees().float() - norm = torch.pow(degs, -0.5) - norm[torch.isinf(norm)] = 0 - if cuda: - norm = norm.cuda() - g.ndata['norm'] = norm.unsqueeze(1) - - # create GCN model - model = GCN(g, - in_feats, - args.n_hidden, - n_classes, - args.n_layers, - F.relu, - args.dropout) - - if cuda: - model.cuda() - loss_fcn = torch.nn.CrossEntropyLoss() - - # use optimizer - optimizer = torch.optim.Adam(model.parameters(), - lr=args.lr, - weight_decay=args.weight_decay) - - # initialize graph - dur = [] - for epoch in range(args.n_epochs): - model.train() - if epoch >= 3: - t0 = time.time() - # forward - logits = model(features) - loss = loss_fcn(logits[train_mask], labels[train_mask]) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - - if epoch >= 3: - dur.append(time.time() - t0) - - acc = evaluate(model, features, labels, val_mask) - print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " - "ETputs(KTEPS) {:.2f}". 
format(epoch, np.mean(dur), loss.item(), - acc, n_edges / np.mean(dur) / 1000)) - - acc = evaluate(model, features, labels, test_mask) - print("Test accuracy {:.2%}".format(acc)) - - acc_threshold = 0.75 - assert acc > acc_threshold, "Accuracy {} is not greater than threshold {}".format(acc, acc_threshold) - -def parse_args(): - parser = argparse.ArgumentParser(description='GCN') - register_data_args(parser) - parser.add_argument("--dropout", type=float, default=0.5, - help="dropout probability") - parser.add_argument("--gpu", type=int, default=-1, - help="gpu") - parser.add_argument("--lr", type=float, default=1e-2, - help="learning rate") - parser.add_argument("--n-epochs", type=int, default=200, - help="number of training epochs") - parser.add_argument("--n-hidden", type=int, default=16, - help="number of hidden gcn units") - parser.add_argument("--n-layers", type=int, default=1, - help="number of hidden gcn layers") - parser.add_argument("--weight-decay", type=float, default=5e-4, - help="Weight for L2 loss") - parser.add_argument("--self-loop", action='store_true', - help="graph self-loop (default=False)") - parser.set_defaults(self_loop=False) - - return parser.parse_args() - -if __name__ == '__main__': - num_gpus = int(os.environ['SM_NUM_GPUS']) - if num_gpus > 0: - gpu = 0 - else: - gpu = -1 - - args = parse_args() - args.gpu = gpu - args.dataset = 'cora' - print(args) - - main(args) - diff --git a/test/resources/fastai/cifar/cifar.py b/test/resources/fastai/cifar/cifar.py deleted file mode 100644 index e41fbfde..00000000 --- a/test/resources/fastai/cifar/cifar.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. 
This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. - -# Based on https://github.com/fastai/fastai/blob/master/examples/train_cifar.py -# imports and the code was as much preserved to match the official example -from fastai.script import * -from fastai.vision import * -from fastai.vision.models.wrn import wrn_22 -from fastai.distributed import * -import torch - -torch.backends.cudnn.benchmark = True - -@call_parse -def main(gpu:Param("GPU to run on", str)=None): - """Distrubuted training of CIFAR-10. - Fastest speed is if you run as follows: - python -m fastai.launch cifar.py""" - gpu = setup_distrib(gpu) - n_gpus = int(os.environ.get("WORLD_SIZE", 1)) - tgz_path = os.environ.get('SM_CHANNEL_TRAINING') - path = os.path.join(tgz_path, 'cifar10_tiny') - tarfile.open(f'{path}.tgz', 'r:gz').extractall(tgz_path) - ds_tfms = ([*rand_pad(4, 32), flip_lr(p=0.5)], []) - workers = min(16, num_cpus()//n_gpus) - data = ImageDataBunch.from_folder(path, valid='test', ds_tfms=ds_tfms, bs=512//n_gpus, - num_workers=workers) - data.normalize(cifar_stats) - learn = Learner(data, wrn_22(), metrics=accuracy, path='/opt/ml', model_dir='model') - if gpu is None: - learn.model = nn.DataParallel(learn.model) - else: - learn.to_distributed(gpu) - learn.to_fp16() - learn.fit_one_cycle(2, 3e-3, wd=0.4) - learn.save('model') diff --git a/test/resources/fastai/cifar/train_cifar.py b/test/resources/fastai/cifar/train_cifar.py deleted file mode 100644 index febac46b..00000000 --- a/test/resources/fastai/cifar/train_cifar.py +++ /dev/null @@ -1,6 +0,0 @@ - -# TODO: replace with a shell script when it's supported by the container -import os - -# Using system instead of subprocess as an easier way to simulate shell script call -os.system('python -m fastai.launch cifar.py') diff --git 
a/test/resources/fastai/cifar_tiny/training/cifar10_tiny.tgz b/test/resources/fastai/cifar_tiny/training/cifar10_tiny.tgz deleted file mode 100644 index 7ce6d275..00000000 Binary files a/test/resources/fastai/cifar_tiny/training/cifar10_tiny.tgz and /dev/null differ diff --git a/test/resources/fastai/mnist.py b/test/resources/fastai/mnist.py deleted file mode 100644 index cf806ee8..00000000 --- a/test/resources/fastai/mnist.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. 
- -# Based on https://github.com/fastai/fastai/blob/master/examples/train_mnist.py -# imports and the code was as much preserved to match the official example -from fastai.script import * -from fastai.vision import * - -@call_parse -def main(): - tgz_path = os.environ.get('SM_CHANNEL_TRAINING') - path = os.path.join(tgz_path, 'mnist_tiny') - tarfile.open(f'{path}.tgz', 'r:gz').extractall(tgz_path) - tfms = (rand_pad(2, 28), []) - data = ImageDataBunch.from_folder(path, ds_tfms=tfms, bs=64) - data.normalize(imagenet_stats) - learn = create_cnn(data, models.resnet18, metrics=accuracy, path='/opt/ml', model_dir='model') - learn.fit_one_cycle(1, 0.02) - learn.save('model') diff --git a/test/resources/fastai/mnist_tiny/training/mnist_tiny.tgz b/test/resources/fastai/mnist_tiny/training/mnist_tiny.tgz deleted file mode 100644 index c24f3f03..00000000 Binary files a/test/resources/fastai/mnist_tiny/training/mnist_tiny.tgz and /dev/null differ diff --git a/test/resources/mnist/mnist.py b/test/resources/mnist/mnist.py index 221e94c5..1359f712 100644 --- a/test/resources/mnist/mnist.py +++ b/test/resources/mnist/mnist.py @@ -11,13 +11,13 @@ # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. from __future__ import absolute_import + import argparse import logging import os import sys -import cv2 as cv -import sagemaker_containers +from sagemaker_training import environment import torch import torch.distributed as dist import torch.nn as nn @@ -25,7 +25,6 @@ import torch.optim as optim import torch.utils.data import torch.utils.data.distributed - from torchvision import datasets, transforms logger = logging.getLogger(__name__) @@ -122,25 +121,18 @@ def train(args): 100. 
* len(test_loader.sampler) / len(test_loader.dataset) )) - model = Net() + model = Net().to(device) if is_distributed and use_cuda: # multi-machine multi-gpu case logger.debug("Multi-machine multi-gpu: using DistributedDataParallel.") - # establish host rank and set device on this node - torch.cuda.set_device(host_rank) - model.cuda(host_rank) - # for multiprocessing distributed, the DDP constructor should always set - # the single device scope. otherwise, DDP will use all available devices. - model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[host_rank], output_device=host_rank) + model = torch.nn.parallel.DistributedDataParallel(model) elif use_cuda: # single-machine multi-gpu case logger.debug("Single-machine multi-gpu: using DataParallel().cuda().") - model = model.to(device) model = torch.nn.DataParallel(model).to(device) else: # single-machine or multi-machine cpu case logger.debug("Single-machine/multi-machine cpu: using DataParallel.") - model = model.to(device) model = torch.nn.DataParallel(model) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) @@ -148,11 +140,7 @@ def train(args): for epoch in range(1, args.epochs + 1): model.train() for batch_idx, (data, target) in enumerate(train_loader, 1): - if is_distributed and use_cuda: - # multi-machine multi-gpu case - allow asynchrous GPU copies of the data - data, target = data.cuda(non_blocking=True), target.cuda(non_blocking=True) - else: - data, target = data.to(device), target.to(device) + data, target = data.to(device), target.to(device) optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) @@ -168,9 +156,6 @@ def train(args): test(model, test_loader, device) save_model(model, args.model_dir) - if is_distributed and host_rank == 0 or not is_distributed: - assert_can_track_sagemaker_experiments() - def test(model, test_loader, device): model.eval() @@ -190,14 +175,6 @@ def test(model, test_loader, device): 100. 
* correct / len(test_loader.dataset))) -def model_fn(model_dir): - logger.info('model_fn') - model = torch.nn.DataParallel(Net()) - with open(os.path.join(model_dir, 'model.pth'), 'rb') as f: - model.load_state_dict(torch.load(f)) - return model - - def save_model(model, model_dir): logger.info("Saving the model.") path = os.path.join(model_dir, 'model.pth') @@ -205,22 +182,7 @@ def save_model(model, model_dir): torch.save(model.state_dict(), path) -def assert_can_track_sagemaker_experiments(): - in_sagemaker_training = 'TRAINING_JOB_ARN' in os.environ - in_python_three = sys.version_info[0] == 3 - - if in_sagemaker_training and in_python_three: - import smexperiments.tracker - - with smexperiments.tracker.Tracker.load() as tracker: - tracker.log_parameter('param', 1) - tracker.log_metric('metric', 1.0) - - if __name__ == '__main__': - # test opencv - print(cv.__version__) - parser = argparse.ArgumentParser() # Data and model checkpoints directories @@ -244,7 +206,7 @@ def assert_can_track_sagemaker_experiments(): help='backend for distributed training') # Container environment - env = sagemaker_containers.training_env() + env = environment.Environment() parser.add_argument('--hosts', type=list, default=env.hosts) parser.add_argument('--current-host', type=str, default=env.current_host) parser.add_argument('--model-dir', type=str, default=env.model_dir) @@ -252,4 +214,3 @@ def assert_can_track_sagemaker_experiments(): parser.add_argument('--num-gpus', type=int, default=env.num_gpus) train(parser.parse_args()) - diff --git a/test/resources/mnist/smdebug_mnist.py b/test/resources/mnist/smdebug_mnist.py deleted file mode 100644 index 4485baa0..00000000 --- a/test/resources/mnist/smdebug_mnist.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. 
A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import -import argparse -import logging -import os -import sys - -import cv2 as cv -import sagemaker_containers -import torch -import torch.distributed as dist -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import torch.utils.data -import torch.utils.data.distributed -from torchvision import datasets, transforms -from smdebug.pytorch import * -import numpy as np -import random - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) -logger.addHandler(logging.StreamHandler(sys.stdout)) - - -# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py -class Net(nn.Module): - def __init__(self): - logger.info("Create neural network module") - - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 10, kernel_size=5) - self.conv2 = nn.Conv2d(10, 20, kernel_size=5) - self.conv2_drop = nn.Dropout2d() - self.fc1 = nn.Linear(320, 50) - self.fc2 = nn.Linear(50, 10) - - def forward(self, x): - x = F.relu(F.max_pool2d(self.conv1(x), 2)) - x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) - x = x.view(-1, 320) - x = F.relu(self.fc1(x)) - x = F.dropout(x, training=self.training) - x = self.fc2(x) - return F.log_softmax(x, dim=1) - - -def parse_args(): - env = sagemaker_containers.training_env() - parser = argparse.ArgumentParser(description="PyTorch MNIST Example") - - parser.add_argument('--data_dir', type=str) - parser.add_argument("--batch-size", type=int, default=4, help="Batch size") - parser.add_argument("--epochs", type=int, default=1, help="Number of Epochs") - parser.add_argument( - 
"--smdebug_path", - type=str, - default=None, - help="S3 URI of the bucket where tensor data will be stored.", - ) - parser.add_argument("--learning_rate", type=float, default=0.1) - parser.add_argument("--momentum", type=float, default=0.9) - parser.add_argument("--random_seed", type=bool, default=False) - parser.add_argument( - "--num_steps", - type=int, - default=50, - help="Reduce the number of training " - "and evaluation steps to the give number if desired." - "If this is not passed, trains for one epoch " - "of training and validation data", - ) - parser.add_argument('--log_interval', type=int, default=100, metavar='N', - help='how many batches to wait before logging training status') - - opt = parser.parse_args() - return opt - - -def _get_train_data_loader(batch_size, training_dir): - logger.info("Get train data loader") - dataset = datasets.MNIST(training_dir, train=True, download=True, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])) - return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, - num_workers=4) - - -def _get_test_data_loader(test_batch_size, training_dir): - logger.info("Get test data loader") - return torch.utils.data.DataLoader( - datasets.MNIST(training_dir, train=False, download=True, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=test_batch_size, shuffle=False, num_workers=4) - - -def create_smdebug_hook(output_s3_uri): - # With the following SaveConfig, we will save tensors for steps 1, 2 and 3 - # (indexing starts with 0). - save_config = SaveConfig(save_steps=[1, 2, 3]) - - # Create a hook that logs weights, biases and gradients while training the model. 
- hook = Hook( - out_dir=output_s3_uri, - save_config=save_config, - include_collections=["weights", "gradients", "losses"], - ) - return hook - - -def train(model, device, optimizer, hook, epochs, log_interval, training_dir): - criterion = nn.CrossEntropyLoss() - hook.register_loss(criterion) - - trainloader = _get_train_data_loader(4, training_dir) - validloader = _get_test_data_loader(4, training_dir) - - for epoch in range(epochs): - model.train() - hook.set_mode(modes.TRAIN) - for i, data in enumerate(trainloader): - inputs, labels = data - optimizer.zero_grad() - output = model(inputs) - loss = criterion(output, labels) - loss.backward() - optimizer.step() - - if i % log_interval == 0: - logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, i * len(data), len(trainloader.sampler), - 100. * i / len(trainloader), loss.item())) - - test(model, hook, validloader, device, criterion) - - -def test(model, hook, test_loader, device, loss_fn): - model.eval() - hook.set_mode(modes.EVAL) - test_loss = 0 - correct = 0 - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += loss_fn(output, target).item() # sum up batch loss - pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - - test_loss /= len(test_loader.dataset) - logger.debug('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. 
* correct / len(test_loader.dataset))) - - -def main(): - opt = parse_args() - - if opt.random_seed: - torch.manual_seed(128) - random.seed(12) - np.random.seed(2) - - device = torch.device("cpu") - out_dir = opt.smdebug_path - training_dir = opt.data_dir - hook = create_smdebug_hook(out_dir) - model = Net().to(device) - hook.register_hook(model) - optimizer = optim.SGD(model.parameters(), lr=opt.learning_rate, momentum=opt.momentum) - train(model, device, optimizer, hook, opt.epochs, opt.log_interval, training_dir) - print("Training is complete") - - from smdebug.trials import create_trial - print("Created the trial with out_dir {0}".format(out_dir)) - trial = create_trial(out_dir) - assert trial - print("Train steps: " + str(trial.steps(mode=modes.TRAIN))) - print("Eval steps: " + str(trial.steps(mode=modes.EVAL))) - - print( - f"trial.tensor_names() = {trial.tensor_names()}" - ) - # if loss collection tensors not in in trial.tensor_names() - # means they were not saved - - print(f"collection_manager = {hook.collection_manager}") - - weights_tensors = hook.collection_manager.get("weights").tensor_names - print(f"'weights' collection tensors = {weights_tensors}") - assert len(weights_tensors) > 0 - - gradients_tensors = hook.collection_manager.get("gradients").tensor_names - print(f"'gradients' collection tensors = {gradients_tensors}") - assert len(gradients_tensors) > 0 - - losses_tensors = hook.collection_manager.get("losses").tensor_names - print(f"'losses' collection tensors = {losses_tensors}") - assert len(losses_tensors) > 0 - - assert all( - [name in trial.tensor_names() for name in losses_tensors] - ) - - print("Validation Complete") - -if __name__ == "__main__": - main() - diff --git a/test-toolkit/resources/requirements/entry.py b/test/resources/requirements/entry.py similarity index 100% rename from test-toolkit/resources/requirements/entry.py rename to test/resources/requirements/entry.py diff --git 
a/test-toolkit/resources/requirements/requirements.txt b/test/resources/requirements/requirements.txt similarity index 100% rename from test-toolkit/resources/requirements/requirements.txt rename to test/resources/requirements/requirements.txt diff --git a/test-toolkit/unit/test_train.py b/test/unit/test_train.py similarity index 100% rename from test-toolkit/unit/test_train.py rename to test/unit/test_train.py diff --git a/test-toolkit/utils/image_utils.py b/test/utils/image_utils.py similarity index 95% rename from test-toolkit/utils/image_utils.py rename to test/utils/image_utils.py index bcc8bd86..4135f6f3 100644 --- a/test-toolkit/utils/image_utils.py +++ b/test/utils/image_utils.py @@ -27,7 +27,7 @@ def build_image(framework_version, dockerfile, image_uri, region, cwd='.'): if 'dlc' in dockerfile: ecr_login(region, DLC_AWS_ID) - dockerfile_location = os.path.join('test-toolkit', 'docker', framework_version, dockerfile) + dockerfile_location = os.path.join('test', 'container', framework_version, dockerfile) subprocess.check_call( ['docker', 'build', '-t', image_uri, '-f', dockerfile_location, '--build-arg', diff --git a/test/utils/local_mode_utils.py b/test/utils/local_mode_utils.py index 23e8945a..acd1197e 100644 --- a/test/utils/local_mode_utils.py +++ b/test/utils/local_mode_utils.py @@ -18,7 +18,7 @@ import tarfile import time -from test.integration import resources_path +from integration import resources_path LOCK_PATH = os.path.join(resources_path, 'local_mode_lock') diff --git a/tox.ini b/tox.ini index deeabe8b..eb89a87b 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,6 @@ exclude = __pycache__ .tox test/resources/ - test-toolkit/resources/ lib/ max-complexity = 10 ignore =