From 10d27c517e202e17fb61582cd1631b0c37032fa2 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Wed, 5 Jun 2019 17:15:39 -0700 Subject: [PATCH 01/33] add git_config and validate method --- src/sagemaker/estimator.py | 55 +++++++++++++++++++++++++++++++++--- src/sagemaker/fw_utils.py | 22 +++++++++++++++ tests/unit/test_estimator.py | 13 +++++++++ tests/unit/test_fw_utils.py | 14 +++++++++ 4 files changed, 100 insertions(+), 4 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 6ff4d7b06e..b8f5abd524 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -15,6 +15,8 @@ import json import logging import os +import subprocess +import tempfile import warnings from abc import ABCMeta from abc import abstractmethod @@ -24,7 +26,7 @@ import sagemaker from sagemaker.analytics import TrainingJobAnalytics from sagemaker.fw_utils import (create_image_uri, tar_and_upload_dir, parse_s3_url, UploadedCode, - validate_source_dir) + validate_source_dir, validate_git_config) from sagemaker.job import _Job from sagemaker.local import LocalSession from sagemaker.model import Model, NEO_ALLOWED_TARGET_INSTANCE_FAMILY, NEO_ALLOWED_FRAMEWORKS @@ -771,13 +773,17 @@ class Framework(EstimatorBase): MPI_NUM_PROCESSES_PER_HOST = 'sagemaker_mpi_num_of_processes_per_host' MPI_CUSTOM_MPI_OPTIONS = 'sagemaker_mpi_custom_mpi_options' - def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cloudwatch_metrics=False, - container_log_level=logging.INFO, code_location=None, image_name=None, dependencies=None, **kwargs): + def __init__(self, entry_point, git_config=None, source_dir=None, hyperparameters=None, + enable_cloudwatch_metrics=False, container_log_level=logging.INFO, code_location=None, + image_name=None, dependencies=None, **kwargs): """Base class initializer. Subclasses which override ``__init__`` should invoke ``super()`` Args: - entry_point (str): Path (absolute or relative) to the local Python source file which should be executed + entry_point (str): Path (absolute or relative) to either: 1. the local Python source file if git_support + is False 2. the Python source file in Git repo if git_support is True, which should be executed as the entry point to training. This should be compatible with either Python 2.7 or Python 3.5. + git_config (dict[str, str]): Git configurations used for cloning files, including 'repo', 'branch' + and 'commit' for now (default: None). source_dir (str): Path (absolute or relative) to a directory with any other training source code dependencies aside from the entry point file (default: None). Structure within this directory are preserved when training on Amazon SageMaker. @@ -815,9 +821,11 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl **kwargs: Additional kwargs passed to the ``EstimatorBase`` constructor. """ super(Framework, self).__init__(**kwargs) + if entry_point.startswith('s3://'): raise ValueError('Invalid entry point script: {}. Must be a path to a local file.'.format(entry_point)) self.entry_point = entry_point + self.git_config = git_config self.source_dir = source_dir self.dependencies = dependencies or [] if enable_cloudwatch_metrics: @@ -830,6 +838,45 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl self._hyperparameters = hyperparameters or {} + def _git_clone_code(self): + """Git clone repo containing the training scripts. + + This method also validate ``git_config``. + Set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. + + + """ + validate_git_config(self.git_config) + # create a temporary directory to store the cloned repo + repo_dir = tempfile.mkdtemp() + try: + subprocess.check_call(['git', 'clone', self.git_config['repo'], repo_dir]) + except subprocess.CalledProcessError: + raise ValueError('Failed to clone git repo.') + + # checkout the specified branch and commit + os.chdir(repo_dir) + try: + subprocess.check_call(['git', 'checkout', self.git_config['branch']]) + except subprocess.CalledProcessError: + raise ValueError('Failed to checkout the required branch.') + try: + subprocess.check_call(['git', 'checkout', self.git_config['commit']]) + except subprocess.CalledProcessError: + raise ValueError('Failed to checkout the required commit.') + + # check if the cloned repo contains entry point and source dir; if so, set ``entry_point`` and + # ``source_dir`` to the paths to local file system. + if not os.path.isfile(os.path.join(repo_dir, self.entry_point)): + raise ValueError('Entry point does not exist in the repo.') + else: + self.entry_point = os.path.join(repo_dir, self.entry_point) + if self.source_dir: + if not os.path.isdir(os.path.join(repo_dir, self.source_dir)): + raise ValueError('Source does not exist in the repo.') + else: + self.source_dir = os.path.join(repo_dir, self.source_dir) + def _prepare_for_training(self, job_name=None): """Set hyperparameters needed for training. This method will also validate ``source_dir``. diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py index f186fb8780..05ed4a894a 100644 --- a/src/sagemaker/fw_utils.py +++ b/src/sagemaker/fw_utils.py @@ -126,6 +126,28 @@ def _accelerator_type_valid_for_framework(framework, accelerator_type=None, opti return True +def validate_git_config(git_config): + """check if a git_config param is valid + + Args: + git_config (dict[str, str]): Git configurations used for cloning files, including 'repo', 'branch', + and 'commit' for now. + + Raises: + ValueError: If: + 1. git_config has no key 'repo' + 2. git_config['repo'] is in the wrong format. + """ + if 'repo' not in git_config: + raise ValueError('Please provide a repo for git_config.') + codecommit_url = git_config['repo'].startswith('https://git-codecommit') \ + or git_config['repo'].startswith('ssh://git-codecommit') + github_url = git_config['repo'].startswith('https://github') \ + or git_config['repo'].startswith('git@github') + if not codecommit_url and not github_url: + raise ValueError('Please provide a valid git repo url.') + + def validate_source_dir(script, directory): """Validate that the source directory exists and it contains the user script diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index c5d7a9f426..a3e34396e2 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -596,6 +596,19 @@ def test_prepare_for_training_force_name_generation(strftime, sagemaker_session) assert JOB_NAME == fw._current_job_name +def test_git_clone_code_succeed(sagemaker_session): + git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', + 'branch': 'branch1', + 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} + fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, + source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + enable_cloudwatch_metrics=True) + fw._git_clone_code() + assert os.path.isfile(fw.entry_point) + assert os.path.isdir(fw.source_dir) + + @patch('time.strftime', return_value=TIMESTAMP) def test_init_with_source_dir_s3(strftime, sagemaker_session): fw = DummyFramework(entry_point=SCRIPT_PATH, source_dir='s3://location', role=ROLE, diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py index ed5e97b631..3a598fb0fc 100644 --- a/tests/unit/test_fw_utils.py +++ b/tests/unit/test_fw_utils.py @@ -120,6 +120,20 @@ def test_create_image_uri_local_sagemaker_notebook_accelerator(): assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mxnet-eia:1.0rc-gpu-py3' +def test_validate_git_config_repo_not_provided(): + git_config = {'branch': 'master', 'username': 'User1', 'password': 'passw0rd'} + with pytest.raises(ValueError) as error: + fw_utils.validate_git_config(git_config) + assert 'Please provide a repo for git_config.' in str(error) + + +def test_validate_git_config_bad_repo_url(): + git_config = {'repo': 'hhttps://github.com/user/repo.git', 'branch': 'master', 'password': 'passw0rd'} + with pytest.raises(ValueError) as error: + fw_utils.validate_git_config(git_config) + assert 'Please provide a valid git repo url.' in str(error) + + def test_invalid_accelerator(): error_message = '{} is not a valid SageMaker Elastic Inference accelerator type.'.format(MOCK_ACCELERATOR) # accelerator type is missing 'ml.' prefix From 6b78ed4be5beed9196f090d303b69d66d10a5960 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Thu, 6 Jun 2019 11:37:48 -0700 Subject: [PATCH 02/33] modify the order of git_config, add tests --- src/sagemaker/estimator.py | 6 ++-- tests/unit/test_estimator.py | 62 ++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index b8f5abd524..5b88304a20 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -773,9 +773,9 @@ class Framework(EstimatorBase): MPI_NUM_PROCESSES_PER_HOST = 'sagemaker_mpi_num_of_processes_per_host' MPI_CUSTOM_MPI_OPTIONS = 'sagemaker_mpi_custom_mpi_options' - def __init__(self, entry_point, git_config=None, source_dir=None, hyperparameters=None, + def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cloudwatch_metrics=False, container_log_level=logging.INFO, code_location=None, - image_name=None, dependencies=None, **kwargs): + image_name=None, dependencies=None, git_config=None, **kwargs): """Base class initializer. Subclasses which override ``__init__`` should invoke ``super()`` Args: @@ -873,7 +873,7 @@ def _git_clone_code(self): self.entry_point = os.path.join(repo_dir, self.entry_point) if self.source_dir: if not os.path.isdir(os.path.join(repo_dir, self.source_dir)): - raise ValueError('Source does not exist in the repo.') + raise ValueError('Source directory does not exist in the repo.') else: self.source_dir = os.path.join(repo_dir, self.source_dir) diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index a3e34396e2..ac374a1da5 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -609,6 +609,68 @@ def test_git_clone_code_succeed(sagemaker_session): assert os.path.isdir(fw.source_dir) +def test_git_clone_code_clone_fail(sagemaker_session): + git_config = {'repo': 'https://github.com/GaryTu1020/no-such-repo.git', 'branch': 'master'} + fw = DummyFramework(entry_point='entry_point', git_config=git_config, role=ROLE, + sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) + with pytest.raises(ValueError) as error: + fw._git_clone_code() + assert 'Failed to clone git repo.' in str(error) + + +def test_git_clone_code_branch_not_exist(sagemaker_session): + git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', + 'branch': 'branch-that-does-not-exist', + 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} + fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, + source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + enable_cloudwatch_metrics=True) + with pytest.raises(ValueError) as error: + fw._git_clone_code() + assert 'Failed to checkout the required branch.' in str(error) + + +def test_git_clone_code_commit_not_exist(sagemaker_session): + git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', + 'branch': 'master', + 'commit': 'commit-sha-that-does-not-exist'} + fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, + source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + enable_cloudwatch_metrics=True) + with pytest.raises(ValueError) as error: + fw._git_clone_code() + assert 'Failed to checkout the required commit.' in str(error) + + +def test_git_clone_code_entry_point_not_exist(sagemaker_session): + git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', + 'branch': 'branch1', + 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} + fw = DummyFramework(entry_point='entry_point', git_config=git_config, + source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + enable_cloudwatch_metrics=True) + with pytest.raises(ValueError) as error: + fw._git_clone_code() + assert 'Entry point does not exist in the repo.' in str(error) + + +def test_git_clone_code_source_dir_not_exist(sagemaker_session): + git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', + 'branch': 'branch1', + 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} + fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, + source_dir='source_dir_that_does_not_exist', role=ROLE, + sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) + with pytest.raises(ValueError) as error: + fw._git_clone_code() + assert 'Source directory does not exist in the repo.' in str(error) + + @patch('time.strftime', return_value=TIMESTAMP) def test_init_with_source_dir_s3(strftime, sagemaker_session): fw = DummyFramework(entry_point=SCRIPT_PATH, source_dir='s3://location', role=ROLE, From e59bb7922b80f82849d96a3425ac1021ffc414f4 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Sat, 8 Jun 2019 00:44:17 -0700 Subject: [PATCH 03/33] move validate_git_config, add integ test --- src/sagemaker/estimator.py | 134 +++++++++++++++++++++++++++-------- src/sagemaker/fw_utils.py | 22 ------ tests/integ/test_git.py | 56 +++++++++++++++ tests/unit/test_estimator.py | 92 ++++++++++++++++++------ tests/unit/test_fw_utils.py | 14 ---- 5 files changed, 229 insertions(+), 89 deletions(-) create mode 100644 tests/integ/test_git.py diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 5b88304a20..be3f541446 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -26,7 +26,7 @@ import sagemaker from sagemaker.analytics import TrainingJobAnalytics from sagemaker.fw_utils import (create_image_uri, tar_and_upload_dir, parse_s3_url, UploadedCode, - validate_source_dir, validate_git_config) + validate_source_dir) from sagemaker.job import _Job from sagemaker.local import LocalSession from sagemaker.model import Model, NEO_ALLOWED_TARGET_INSTANCE_FAMILY, NEO_ALLOWED_FRAMEWORKS @@ -779,14 +779,25 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, """Base class initializer. Subclasses which override ``__init__`` should invoke ``super()`` Args: - entry_point (str): Path (absolute or relative) to either: 1. the local Python source file if git_support - is False 2. the Python source file in Git repo if git_support is True, which should be executed - as the entry point to training. This should be compatible with either Python 2.7 or Python 3.5. + entry_point (str): Path (absolute or relative) to either: 1. the local Python source file if git_config + is not provided. 2. the Python source file in Git repo if git_config is provided, which should be + executed as the entry point to training. This should be compatible with either Python 2.7 or Python 3.5. git_config (dict[str, str]): Git configurations used for cloning files, including 'repo', 'branch' - and 'commit' for now (default: None). + and 'commit' (default: None). + 'branch' and 'commit' are optional. If 'branch' is not specified, 'master' branch will be used. If + 'commit' is not specified, the latest commit in the required branch will be used. + Example: + + The following config: + >>> git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', + >>> 'branch': 'master', + >>> 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} + results in cloning the repo specified in 'repo', then checkout the 'master' branch, and checkout + the specified commit. source_dir (str): Path (absolute or relative) to a directory with any other training - source code dependencies aside from the entry point file (default: None). Structure within this - directory are preserved when training on Amazon SageMaker. + source code dependencies aside from the entry point file (default: None). If git_config is not provided, + this should be a local path; if git_config is provided, this should be a path in the Git repo. + Structure within this directory are preserved when training on Amazon SageMaker. dependencies (list[str]): A list of paths to directories (absolute or relative) with any additional libraries that will be exported to the container (default: []). The library folders will be copied to SageMaker in the same folder where the entrypoint is copied. @@ -838,44 +849,107 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, self._hyperparameters = hyperparameters or {} - def _git_clone_code(self): - """Git clone repo containing the training scripts. + def fit(self, inputs=None, wait=True, logs=True, job_name=None): + """Train a model using the input training dataset. If gif_config provided, the method does git clone first. - This method also validate ``git_config``. - Set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. + The API calls the Amazon SageMaker CreateTrainingJob API to start model training. + The API uses configuration you provided to create the estimator and the + specified input training data to send the CreatingTrainingJob request to Amazon SageMaker. + + This is a synchronous operation. After the model training successfully completes, + you can call the ``deploy()`` method to host the model using the Amazon SageMaker hosting services. + + Args: + inputs (str or dict or sagemaker.session.s3_input): Information about the training data. + This can be one of three types: + * (str) the S3 location where training data is saved. + * (dict[str, str] or dict[str, sagemaker.session.s3_input]) If using multiple channels for + training data, you can specify a dict mapping channel names + to strings or :func:`~sagemaker.session.s3_input` objects. + * (sagemaker.session.s3_input) - channel configuration for S3 data sources that can provide + additional information as well as the path to the training dataset. + See :func:`sagemaker.session.s3_input` for full details. + wait (bool): Whether the call should wait until the job completes (default: True). + logs (bool): Whether to show the logs produced by the job. + Only meaningful when wait is True (default: True). + job_name (str): Training job name. If not specified, the estimator generates a default job name, + based on the training image name and current timestamp. """ - validate_git_config(self.git_config) + if self.git_config: + self._git_clone_code() + super(Framework, self).fit(inputs=inputs, wait=wait, logs=logs, job_name=job_name) + + def _git_clone_code(self): + """Git clone repo containing the training scripts. This method also validate ``git_config``, + and set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. + + Raises: + CalledProcessError: If 1. failed to clone git repo + 2. failed to checkout the required branch + 3. failed to checkout the required commit + ValueError: If 1. entry point specified does not exist in the repo + 2. source dir specified does not exist in the repo + """ + self._validate_git_config() # create a temporary directory to store the cloned repo repo_dir = tempfile.mkdtemp() try: subprocess.check_call(['git', 'clone', self.git_config['repo'], repo_dir]) except subprocess.CalledProcessError: - raise ValueError('Failed to clone git repo.') + raise subprocess.CalledProcessError(1, cmd='git clone {} {}'.format(self.git_config['repo'], repo_dir)) - # checkout the specified branch and commit - os.chdir(repo_dir) - try: - subprocess.check_call(['git', 'checkout', self.git_config['branch']]) - except subprocess.CalledProcessError: - raise ValueError('Failed to checkout the required branch.') - try: - subprocess.check_call(['git', 'checkout', self.git_config['commit']]) - except subprocess.CalledProcessError: - raise ValueError('Failed to checkout the required commit.') + self._checkout_branch_and_commit(repo_dir) # check if the cloned repo contains entry point and source dir; if so, set ``entry_point`` and # ``source_dir`` to the paths to local file system. - if not os.path.isfile(os.path.join(repo_dir, self.entry_point)): - raise ValueError('Entry point does not exist in the repo.') - else: - self.entry_point = os.path.join(repo_dir, self.entry_point) if self.source_dir: - if not os.path.isdir(os.path.join(repo_dir, self.source_dir)): - raise ValueError('Source directory does not exist in the repo.') - else: + if os.path.isdir(os.path.join(repo_dir, self.source_dir)): self.source_dir = os.path.join(repo_dir, self.source_dir) + os.chdir(self.source_dir) + else: + raise ValueError('Source directory does not exist in the repo.') + if not os.path.isfile(os.path.join(self.source_dir, self.entry_point)): + raise ValueError('Entry point does not exist in the repo.') + + def _checkout_branch_and_commit(self, repo_dir): + """Enter the directory where the repo is cloned, and checkout the required branch and commit. + + Args: + repo_dir: the directory where the repo is cloned + + Raises: + ValueError: If 1. entry point specified does not exist in the repo + 2. source dir specified does not exist in the repo + """ + os.chdir(repo_dir) + if 'branch' in self.git_config: + try: + subprocess.check_call(['git', 'checkout', self.git_config['branch']]) + except subprocess.CalledProcessError: + raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(self.git_config['branch'])) + if 'commit' in self.git_config: + try: + subprocess.check_call(['git', 'checkout', self.git_config['commit']]) + except subprocess.CalledProcessError: + raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(self.git_config['commit'])) + + def _validate_git_config(self): + """check if a git_config param is valid + + Raises: + ValueError: If: + 1. git_config has no key 'repo' + 2. git_config['repo'] is in the wrong format. + """ + if 'repo' not in self.git_config: + raise ValueError('Please provide a repo for git_config.') + repo = self.git_config['repo'] + codecommit_url = repo.startswith('https://git-codecommit') or repo.startswith('ssh://git-codecommit') + github_url = repo.startswith('https://github') or repo.startswith('git@github') + if not codecommit_url and not github_url: + raise ValueError('Please provide a valid git repo url.') def _prepare_for_training(self, job_name=None): """Set hyperparameters needed for training. This method will also validate ``source_dir``. diff --git a/src/sagemaker/fw_utils.py b/src/sagemaker/fw_utils.py index 05ed4a894a..f186fb8780 100644 --- a/src/sagemaker/fw_utils.py +++ b/src/sagemaker/fw_utils.py @@ -126,28 +126,6 @@ def _accelerator_type_valid_for_framework(framework, accelerator_type=None, opti return True -def validate_git_config(git_config): - """check if a git_config param is valid - - Args: - git_config (dict[str, str]): Git configurations used for cloning files, including 'repo', 'branch', - and 'commit' for now. - - Raises: - ValueError: If: - 1. git_config has no key 'repo' - 2. git_config['repo'] is in the wrong format. - """ - if 'repo' not in git_config: - raise ValueError('Please provide a repo for git_config.') - codecommit_url = git_config['repo'].startswith('https://git-codecommit') \ - or git_config['repo'].startswith('ssh://git-codecommit') - github_url = git_config['repo'].startswith('https://github') \ - or git_config['repo'].startswith('git@github') - if not codecommit_url and not github_url: - raise ValueError('Please provide a valid git repo url.') - - def validate_source_dir(script, directory): """Validate that the source directory exists and it contains the user script diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py new file mode 100644 index 0000000000..b7454ed428 --- /dev/null +++ b/tests/integ/test_git.py @@ -0,0 +1,56 @@ +# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os + +import numpy + +from sagemaker.pytorch.estimator import PyTorch +from sagemaker.pytorch.model import PyTorchModel +from sagemaker.utils import sagemaker_timestamp +from tests.integ import DATA_DIR, PYTHON_VERSION, TRAINING_DEFAULT_TIMEOUT_MINUTES +from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name + + +def test_git_support_with_pytorch(sagemaker_local_session): + with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): + script_path = 'mnist.py' + data_path = os.path.join(DATA_DIR, 'pytorch_mnist') + git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', + 'branch': 'branch1', + 'commit': '1867259a76ee740b99ce7ab00d6a32b582c85e06'} + pytorch = PyTorch(entry_point=script_path, role='SageMakerRole', source_dir='pytorch', + framework_version=PyTorch.LATEST_VERSION, py_version=PYTHON_VERSION, + train_instance_count=1, train_instance_type='ml.c4.xlarge', + sagemaker_session=sagemaker_local_session, git_config=git_config) + + train_input = pytorch.sagemaker_session.upload_data(path=os.path.join(data_path, 'training'), + key_prefix='integ-test-data/pytorch_mnist/training') + pytorch.fit({'training': train_input}) + + files = [file for file in os.listdir(pytorch.source_dir)] + assert files == ['some-file', 'mnist.py'] + + endpoint_name = 'test-git_support-with-pytorch-{}'.format(sagemaker_timestamp()) + + with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_local_session): + desc = sagemaker_local_session.sagemaker_client.describe_training_job(pytorch.latest_training_job.name) + model_data = desc['ModelArtifacts']['S3ModelArtifacts'] + model = PyTorchModel(model_data, 'SageMakerRole', entry_point=script_path, + sagemaker_session=sagemaker_local_session) + predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) + + data = numpy.zeros(shape=(1, 1, 28, 28)) + result = predictor.predict(data) + assert result is not None diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index ac374a1da5..381240e4d9 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -15,6 +15,7 @@ import logging import json import os +import subprocess from time import sleep import pytest @@ -596,78 +597,123 @@ def test_prepare_for_training_force_name_generation(strftime, sagemaker_session) assert JOB_NAME == fw._current_job_name -def test_git_clone_code_succeed(sagemaker_session): +def test_git_support_with_branch_and_commit_succeed(sagemaker_session): git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', 'branch': 'branch1', 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} - fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, + fw = DummyFramework(entry_point='entry_point', git_config=git_config, + source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + enable_cloudwatch_metrics=True) + fw.fit() + assert os.path.isfile(fw.entry_point) + assert os.path.isdir(fw.source_dir) + + +def test_git_support_with_branch_succeed(sagemaker_session): + git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', + 'branch': 'branch1'} + fw = DummyFramework(entry_point='entry_point', git_config=git_config, source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) - fw._git_clone_code() + fw.fit() assert os.path.isfile(fw.entry_point) assert os.path.isdir(fw.source_dir) -def test_git_clone_code_clone_fail(sagemaker_session): +def test_git_support_without_branch_and_commit_succeed(sagemaker_session): + git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git'} + fw = DummyFramework(entry_point='entry_point', git_config=git_config, + role=ROLE, sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, + enable_cloudwatch_metrics=True) + fw.fit() + assert os.path.isfile(fw.entry_point) + + +def test_git_support_repo_not_provided(sagemaker_session): + git_config = {'branch': 'master', + 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} + fw = DummyFramework(entry_point='entry_point', git_config=git_config, + source_dir='source_dir', role=ROLE, + sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) + with pytest.raises(ValueError) as error: + fw.fit() + assert 'Please provide a repo for git_config.' in str(error) + + +def test_git_support_bad_repo_url_format(sagemaker_session): + git_config = {'repo': 'hhttps://github.com/user/repo.git', 'branch': 'master', 'password': 'passw0rd'} + fw = DummyFramework(entry_point='entry_point', git_config=git_config, + source_dir='source_dir', role=ROLE, + sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) + with pytest.raises(ValueError) as error: + fw.fit() + assert 'Please provide a valid git repo url.' in str(error) + + +def test_git_support_git_clone_fail(sagemaker_session): git_config = {'repo': 'https://github.com/GaryTu1020/no-such-repo.git', 'branch': 'master'} fw = DummyFramework(entry_point='entry_point', git_config=git_config, role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) - with pytest.raises(ValueError) as error: - fw._git_clone_code() - assert 'Failed to clone git repo.' in str(error) + with pytest.raises(subprocess.CalledProcessError) as error: + fw.fit() + assert "Command 'git clone" in str(error) -def test_git_clone_code_branch_not_exist(sagemaker_session): +def test_git_support_branch_not_exist(sagemaker_session): git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', 'branch': 'branch-that-does-not-exist', 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} - fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, + fw = DummyFramework(entry_point='entry_point', git_config=git_config, source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) - with pytest.raises(ValueError) as error: - fw._git_clone_code() - assert 'Failed to checkout the required branch.' in str(error) + with pytest.raises(subprocess.CalledProcessError) as error: + fw.fit() + assert "Command 'git checkout" in str(error) -def test_git_clone_code_commit_not_exist(sagemaker_session): +def test_git_support_commit_not_exist(sagemaker_session): git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', 'branch': 'master', 'commit': 'commit-sha-that-does-not-exist'} - fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, + fw = DummyFramework(entry_point='entry_point', git_config=git_config, source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) - with pytest.raises(ValueError) as error: - fw._git_clone_code() - assert 'Failed to checkout the required commit.' in str(error) + with pytest.raises(subprocess.CalledProcessError) as error: + fw.fit() + assert "Command 'git checkout" in str(error) -def test_git_clone_code_entry_point_not_exist(sagemaker_session): +def test_git_support_entry_point_not_exist(sagemaker_session): git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', 'branch': 'branch1', 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} - fw = DummyFramework(entry_point='entry_point', git_config=git_config, + fw = DummyFramework(entry_point='entry_point_that_does_not_exist', git_config=git_config, source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) with pytest.raises(ValueError) as error: - fw._git_clone_code() + fw.fit() assert 'Entry point does not exist in the repo.' in str(error) -def test_git_clone_code_source_dir_not_exist(sagemaker_session): +def test_git_support_source_dir_not_exist(sagemaker_session): git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', 'branch': 'branch1', 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} - fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, + fw = DummyFramework(entry_point='entry_point', git_config=git_config, source_dir='source_dir_that_does_not_exist', role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) with pytest.raises(ValueError) as error: - fw._git_clone_code() + fw.fit() assert 'Source directory does not exist in the repo.' in str(error) diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py index 3a598fb0fc..ed5e97b631 100644 --- a/tests/unit/test_fw_utils.py +++ b/tests/unit/test_fw_utils.py @@ -120,20 +120,6 @@ def test_create_image_uri_local_sagemaker_notebook_accelerator(): assert image_uri == '520713654638.dkr.ecr.mars-south-3.amazonaws.com/sagemaker-mxnet-eia:1.0rc-gpu-py3' -def test_validate_git_config_repo_not_provided(): - git_config = {'branch': 'master', 'username': 'User1', 'password': 'passw0rd'} - with pytest.raises(ValueError) as error: - fw_utils.validate_git_config(git_config) - assert 'Please provide a repo for git_config.' in str(error) - - -def test_validate_git_config_bad_repo_url(): - git_config = {'repo': 'hhttps://github.com/user/repo.git', 'branch': 'master', 'password': 'passw0rd'} - with pytest.raises(ValueError) as error: - fw_utils.validate_git_config(git_config) - assert 'Please provide a valid git repo url.' in str(error) - - def test_invalid_accelerator(): error_message = '{} is not a valid SageMaker Elastic Inference accelerator type.'.format(MOCK_ACCELERATOR) # accelerator type is missing 'ml.' prefix From 7808faa22babdfe0f78b88878a21c5d5202498e6 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 10 Jun 2019 15:06:05 -0700 Subject: [PATCH 04/33] modify location _git_clone_code called --- src/sagemaker/estimator.py | 75 ++++++++++++++++++------------------ tests/unit/test_estimator.py | 4 +- 2 files changed, 39 insertions(+), 40 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index be3f541446..41e34c623e 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -779,9 +779,21 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, """Base class initializer. Subclasses which override ``__init__`` should invoke ``super()`` Args: - entry_point (str): Path (absolute or relative) to either: 1. the local Python source file if git_config - is not provided. 2. the Python source file in Git repo if git_config is provided, which should be - executed as the entry point to training. This should be compatible with either Python 2.7 or Python 3.5. + entry_point (str): Path (absolute or relative) to the local Python source file which should be executed + as the entry point to training. This should be compatible with either Python 2.7 or Python 3.5. + If 'git_config' is provided, 'entry_point' should be a relative location to the Python source file in + the Git repo. + Example: + + If the following is the structure of a Git repo: + + >>> |----- README.md + >>> |----- src + >>> |----- train.py + >>> |----- test.py + + and we need 'train.py' as entry point, and 'source_dir' is not provided, then 'entry_point' should + be 'src/train.py'. git_config (dict[str, str]): Git configurations used for cloning files, including 'repo', 'branch' and 'commit' (default: None). 'branch' and 'commit' are optional. If 'branch' is not specified, 'master' branch will be used. If @@ -789,15 +801,28 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, Example: The following config: + >>> git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', >>> 'branch': 'master', >>> 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} + results in cloning the repo specified in 'repo', then checkout the 'master' branch, and checkout the specified commit. source_dir (str): Path (absolute or relative) to a directory with any other training - source code dependencies aside from the entry point file (default: None). If git_config is not provided, - this should be a local path; if git_config is provided, this should be a path in the Git repo. - Structure within this directory are preserved when training on Amazon SageMaker. + source code dependencies aside from the entry point file (default: None). Structure within this + directory are preserved when training on Amazon SageMaker. If 'git_config' is provided, + source_dir should be a relative location to a directory in the Git repo. + Example: + + If the following is the structure of a Git repo: + + >>> |----- README.md + >>> |----- src + >>> |----- train.py + >>> |----- test.py + + and we need 'train.py' as entry point and 'src' as source dir, then 'entry_point' should be + 'train.py', 'source_dir' should be 'src'. dependencies (list[str]): A list of paths to directories (absolute or relative) with any additional libraries that will be exported to the container (default: []). The library folders will be copied to SageMaker in the same folder where the entrypoint is copied. @@ -849,38 +874,6 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, self._hyperparameters = hyperparameters or {} - def fit(self, inputs=None, wait=True, logs=True, job_name=None): - """Train a model using the input training dataset. If gif_config provided, the method does git clone first. - - The API calls the Amazon SageMaker CreateTrainingJob API to start model training. - The API uses configuration you provided to create the estimator and the - specified input training data to send the CreatingTrainingJob request to Amazon SageMaker. - - This is a synchronous operation. After the model training successfully completes, - you can call the ``deploy()`` method to host the model using the Amazon SageMaker hosting services. - - Args: - inputs (str or dict or sagemaker.session.s3_input): Information about the training data. - This can be one of three types: - - * (str) the S3 location where training data is saved. - - * (dict[str, str] or dict[str, sagemaker.session.s3_input]) If using multiple channels for - training data, you can specify a dict mapping channel names - to strings or :func:`~sagemaker.session.s3_input` objects. - * (sagemaker.session.s3_input) - channel configuration for S3 data sources that can provide - additional information as well as the path to the training dataset. - See :func:`sagemaker.session.s3_input` for full details. - wait (bool): Whether the call should wait until the job completes (default: True). - logs (bool): Whether to show the logs produced by the job. - Only meaningful when wait is True (default: True). - job_name (str): Training job name. If not specified, the estimator generates a default job name, - based on the training image name and current timestamp. - """ - if self.git_config: - self._git_clone_code() - super(Framework, self).fit(inputs=inputs, wait=wait, logs=logs, job_name=job_name) - def _git_clone_code(self): """Git clone repo containing the training scripts. This method also validate ``git_config``, and set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. @@ -912,6 +905,9 @@ def _git_clone_code(self): raise ValueError('Source directory does not exist in the repo.') if not os.path.isfile(os.path.join(self.source_dir, self.entry_point)): raise ValueError('Entry point does not exist in the repo.') + else: + if not os.path.isfile(os.path.join(repo_dir, self.entry_point)): + raise ValueError('Entry point does not exist in the repo.') def _checkout_branch_and_commit(self, repo_dir): """Enter the directory where the repo is cloned, and checkout the required branch and commit. @@ -960,6 +956,9 @@ def _prepare_for_training(self, job_name=None): """ super(Framework, self)._prepare_for_training(job_name=job_name) + if self.git_config: + self._git_clone_code() + # validate source dir will raise a ValueError if there is something wrong with the # source directory. We are intentionally not handling it because this is a critical error. if self.source_dir and not self.source_dir.lower().startswith('s3://'): diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 381240e4d9..274b8afcf2 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -695,8 +695,8 @@ def test_git_support_entry_point_not_exist(sagemaker_session): git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', 'branch': 'branch1', 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} - fw = DummyFramework(entry_point='entry_point_that_does_not_exist', git_config=git_config, - source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, + fw = DummyFramework(entry_point='entry_point', git_config=git_config, + role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) with pytest.raises(ValueError) as error: From 685750a53d77ebebeddbeb8f1105b03bc7830c28 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Wed, 12 Jun 2019 15:56:43 -0700 Subject: [PATCH 05/33] create git_utils, write test for dependencies --- src/sagemaker/estimator.py | 164 ++++++++++++++++++----------------- src/sagemaker/git_utils.py | 110 +++++++++++++++++++++++ src/sagemaker/model.py | 43 +++++++-- tests/unit/test_git_utils.py | 0 4 files changed, 229 insertions(+), 88 deletions(-) create mode 100644 src/sagemaker/git_utils.py create mode 100644 tests/unit/test_git_utils.py diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index c900e3e0c8..1841506b8e 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -15,8 +15,7 @@ import json import logging import os -import subprocess -import tempfile + import warnings from abc import ABCMeta from abc import abstractmethod @@ -25,8 +24,9 @@ import sagemaker from sagemaker.analytics import TrainingJobAnalytics -from sagemaker.fw_utils import (create_image_uri, tar_and_upload_dir, parse_s3_url, UploadedCode, - validate_source_dir) +from sagemaker.fw_utils import create_image_uri, tar_and_upload_dir, parse_s3_url, UploadedCode, validate_source_dir +from sagemaker.git_utils import git_clone_repo_and_enter + from sagemaker.job import _Job from sagemaker.local import LocalSession from sagemaker.model import Model, NEO_ALLOWED_TARGET_INSTANCE_FAMILY, NEO_ALLOWED_FRAMEWORKS @@ -813,7 +813,7 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, source_dir (str): Path (absolute or relative) to a directory with any other training source code dependencies aside from the entry point file (default: None). Structure within this directory are preserved when training on Amazon SageMaker. If 'git_config' is provided, - source_dir should be a relative location to a directory in the Git repo. + 'source_dir' should be a relative location to a directory in the Git repo. Example: With the following GitHub repo directory structure: @@ -827,6 +827,8 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, dependencies (list[str]): A list of paths to directories (absolute or relative) with any additional libraries that will be exported to the container (default: []). The library folders will be copied to SageMaker in the same folder where the entrypoint is copied. + If 'git_config' is provided, 'dependencies' should be a list of relative locations to directories + with any additional libraries needed in the Git repo. Example: The following call @@ -875,81 +877,80 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, self._hyperparameters = hyperparameters or {} - def _git_clone_code(self): - """Git clone repo containing the training scripts. This method also validate ``git_config``, - and set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. - - Raises: - CalledProcessError: If 1. failed to clone git repo - 2. failed to checkout the required branch - 3. failed to checkout the required commit - ValueError: If 1. entry point specified does not exist in the repo - 2. source dir specified does not exist in the repo - """ - self._validate_git_config() - # create a temporary directory to store the cloned repo - repo_dir = tempfile.mkdtemp() - try: - subprocess.check_call(['git', 'clone', self.git_config['repo'], repo_dir]) - except subprocess.CalledProcessError: - raise subprocess.CalledProcessError(1, cmd='git clone {} {}'.format(self.git_config['repo'], repo_dir)) - - self._checkout_branch_and_commit(repo_dir) - - # check if the cloned repo contains entry point and source dir; if so, set ``entry_point`` and - # ``source_dir`` to the paths to local file system. - if self.source_dir: - if os.path.isdir(os.path.join(repo_dir, self.source_dir)): - self.source_dir = os.path.join(repo_dir, self.source_dir) - os.chdir(self.source_dir) - else: - raise ValueError('Source directory does not exist in the repo.') - if not os.path.isfile(os.path.join(self.source_dir, self.entry_point)): - raise ValueError('Entry point does not exist in the repo.') - else: - if not os.path.isfile(os.path.join(repo_dir, self.entry_point)): - raise ValueError('Entry point does not exist in the repo.') - for path in self.dependencies: - if not os.path.isdir(os.path.join(repo_dir, path)): - raise ValueError('Dependency {} does not exist in the repo.'.format(path)) - - def _checkout_branch_and_commit(self, repo_dir): - """Enter the directory where the repo is cloned, and checkout the required branch and commit. - - Args: - repo_dir: the directory where the repo is cloned - - Raises: - ValueError: If 1. entry point specified does not exist in the repo - 2. source dir specified does not exist in the repo - """ - os.chdir(repo_dir) - if 'branch' in self.git_config: - try: - subprocess.check_call(['git', 'checkout', self.git_config['branch']]) - except subprocess.CalledProcessError: - raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(self.git_config['branch'])) - if 'commit' in self.git_config: - try: - subprocess.check_call(['git', 'checkout', self.git_config['commit']]) - except subprocess.CalledProcessError: - raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(self.git_config['commit'])) - - def _validate_git_config(self): - """check if a git_config param is valid - - Raises: - ValueError: If: - 1. git_config has no key 'repo' - 2. git_config['repo'] is in the wrong format. - """ - if 'repo' not in self.git_config: - raise ValueError('Please provide a repo for git_config.') - repo = self.git_config['repo'] - codecommit_url = repo.startswith('https://git-codecommit') or repo.startswith('ssh://git-codecommit') - github_url = repo.startswith('https://github') or repo.startswith('git@github') - if not codecommit_url and not github_url: - raise ValueError('Please provide a valid git repo url.') + # def _git_clone_code(self): + # """Git clone repo containing the training scripts. This method also validate ``git_config``, + # and set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. + # + # Raises: + # CalledProcessError: If 1. failed to clone git repo + # 2. failed to checkout the required branch + # 3. failed to checkout the required commit + # ValueError: If 1. entry point specified does not exist in the repo + # 2. source dir specified does not exist in the repo + # """ + # self._validate_git_config() + # # create a temporary directory to store the cloned repo + # repo_dir = tempfile.mkdtemp() + # try: + # subprocess.check_call(['git', 'clone', self.git_config['repo'], repo_dir]) + # except subprocess.CalledProcessError: + # raise subprocess.CalledProcessError(1, cmd='git clone {} {}'.format(self.git_config['repo'], repo_dir)) + # + # self._checkout_branch_and_commit(repo_dir) + # + # # check if the cloned repo contains entry point, source dir and dependencies + # if self.source_dir: + # if os.path.isdir(os.path.join(repo_dir, self.source_dir)): + # self.source_dir = os.path.join(repo_dir, self.source_dir) + # os.chdir(self.source_dir) + # else: + # raise ValueError('Source directory does not exist in the repo.') + # if not os.path.isfile(os.path.join(self.source_dir, self.entry_point)): + # raise ValueError('Entry point does not exist in the repo.') + # else: + # if not os.path.isfile(os.path.join(repo_dir, self.entry_point)): + # raise ValueError('Entry point does not exist in the repo.') + # for path in self.dependencies: + # if not os.path.isdir(os.path.join(repo_dir, path)): + # raise ValueError('Dependency {} does not exist in the repo.'.format(path)) + # + # def _checkout_branch_and_commit(self, repo_dir): + # """Enter the directory where the repo is cloned, and checkout the required branch and commit. + # + # Args: + # repo_dir: the directory where the repo is cloned + # + # Raises: + # ValueError: If 1. entry point specified does not exist in the repo + # 2. source dir specified does not exist in the repo + # """ + # os.chdir(repo_dir) + # if 'branch' in self.git_config: + # try: + # subprocess.check_call(['git', 'checkout', self.git_config['branch']]) + # except subprocess.CalledProcessError: + # raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(self.git_config['branch'])) + # if 'commit' in self.git_config: + # try: + # subprocess.check_call(['git', 'checkout', self.git_config['commit']]) + # except subprocess.CalledProcessError: + # raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(self.git_config['commit'])) + # + # def _validate_git_config(self): + # """check if a git_config param is valid + # + # Raises: + # ValueError: If: + # 1. git_config has no key 'repo' + # 2. git_config['repo'] is in the wrong format. + # """ + # if 'repo' not in self.git_config: + # raise ValueError('Please provide a repo for git_config.') + # repo = self.git_config['repo'] + # codecommit_url = repo.startswith('https://git-codecommit') or repo.startswith('ssh://git-codecommit') + # github_url = repo.startswith('https://github') or repo.startswith('git@github') + # if not codecommit_url and not github_url: + # raise ValueError('Please provide a valid git repo url.') def _prepare_for_training(self, job_name=None): """Set hyperparameters needed for training. This method will also validate ``source_dir``. @@ -961,7 +962,8 @@ def _prepare_for_training(self, job_name=None): super(Framework, self)._prepare_for_training(job_name=job_name) if self.git_config: - self._git_clone_code() + # self._git_clone_code() + git_clone_repo_and_enter(self.git_config, self.entry_point, self.source_dir, self.dependencies) # validate source dir will raise a ValueError if there is something wrong with the # source directory. We are intentionally not handling it because this is a critical error. diff --git a/src/sagemaker/git_utils.py b/src/sagemaker/git_utils.py new file mode 100644 index 0000000000..a8733630e8 --- /dev/null +++ b/src/sagemaker/git_utils.py @@ -0,0 +1,110 @@ +# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import subprocess +import tempfile + + +def git_clone_repo_and_enter(git_config, entry_point, source_dir=None, dependencies=None): + """Git clone repo containing the training code and serving code. This method also validate ``git_config``, + and set ``entry_point``, ``source_dir`` and ``dependencies`` to the right file or directory in the repo cloned. + This method also change the current working path to the repo cloned. + + Args: + git_config (dict[str, str]): Git configurations used for cloning files, including ``repo``, ``branch`` + and ``commit``. ``branch`` and ``commit`` are optional. If ``branch`` is not specified, master branch + will be used. If ``commit`` is not specified, the latest commit in the required branch will be used. + entry_point (str): A relative location to the Python source file which should be executed as the entry point + to training or model hosting in the Git repo. + source_dir (str): A relative location to a directory with other training or model hosting source code + dependencies aside from the entry point file in the Git repo (default: None). Structure within this + directory are preserved when training on Amazon SageMaker. + dependencies (list[str]): A list of relative locations to directories with any additional libraries that will + be exported to the container in the Git repo (default: []). + + + Raises: + CalledProcessError: If 1. failed to clone git repo + 2. failed to checkout the required branch + 3. failed to checkout the required commit + ValueError: If 1. entry point specified does not exist in the repo + 2. source dir specified does not exist in the repo + """ + validate_git_config(git_config) + repo_dir = tempfile.mkdtemp() + try: + subprocess.check_call(['git', 'clone', git_config['repo'], repo_dir]) + except subprocess.CalledProcessError: + raise subprocess.CalledProcessError(1, cmd='git clone {} {}'.format(git_config['repo'], repo_dir)) + + checkout_branch_and_commit(git_config, repo_dir) + + # check if the cloned repo contains entry point, source directory and dependencies + if source_dir: + if not os.path.isdir(os.path.join(repo_dir, source_dir)): + raise ValueError('Source directory does not exist in the repo.') + if not os.path.isfile(os.path.join(repo_dir, source_dir, entry_point)): + raise ValueError('Entry point does not exist in the repo.') + else: + if not os.path.isfile(os.path.join(repo_dir, entry_point)): + raise ValueError('Entry point does not exist in the repo.') + for path in dependencies: + if not os.path.exists(os.path.join(repo_dir, path)): + raise ValueError('Dependency {} does not exist in the repo.'.format(path)) + + +def validate_git_config(git_config): + """check if a git_config param is valid + + Args: + git_config ((dict[str, str]): Git configurations used for cloning files, including ``repo``, ``branch`` + and ``commit``. + + Raises: + ValueError: If: + 1. git_config has no key 'repo' + 2. git_config['repo'] is in the wrong format. + """ + if 'repo' not in git_config: + raise ValueError('Please provide a repo for git_config.') + repo = git_config['repo'] + github_url = repo.startswith('https://github') or repo.startswith('git@github') + if not github_url: + raise ValueError('Please provide a valid git repo url.') + + +def checkout_branch_and_commit(git_config, repo_dir): + """Enter the directory where the repo is cloned, and checkout the required branch and commit. + + Args: + git_config: (dict[str, str]): Git configurations used for cloning files, including ``repo``, ``branch`` + and ``commit``. + repo_dir (str): the directory where the repo is cloned + + Raises: + ValueError: If 1. entry point specified does not exist in the repo + 2. source dir specified does not exist in the repo + """ + os.chdir(repo_dir) + if 'branch' in git_config: + try: + subprocess.check_call(['git', 'checkout', git_config['branch']]) + except subprocess.CalledProcessError: + raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(git_config['branch'])) + if 'commit' in git_config: + try: + subprocess.check_call(['git', 'checkout', git_config['commit']]) + except subprocess.CalledProcessError: + raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(git_config['commit'])) diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index a0412e2ff0..2b5af93ccb 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -20,6 +20,7 @@ from sagemaker import fw_utils, local, session, utils from sagemaker.fw_utils import UploadedCode from sagemaker.transformer import Transformer +from sagemaker.git_utils import git_clone_repo_and_enter LOGGER = logging.getLogger('sagemaker') @@ -366,7 +367,7 @@ class FrameworkModel(Model): def __init__(self, model_data, image, role, entry_point, source_dir=None, predictor_cls=None, env=None, name=None, enable_cloudwatch_metrics=False, container_log_level=logging.INFO, code_location=None, - sagemaker_session=None, dependencies=None, **kwargs): + sagemaker_session=None, dependencies=None, git_config=None, **kwargs): """Initialize a ``FrameworkModel``. Args: @@ -375,24 +376,49 @@ def __init__(self, model_data, image, role, entry_point, source_dir=None, predic role (str): An IAM role name or ARN for SageMaker to access AWS resources on your behalf. entry_point (str): Path (absolute or relative) to the Python source file which should be executed as the entry point to model hosting. This should be compatible with either Python 2.7 or Python 3.5. + If 'git_config' is provided, 'entry_point' should be a relative location to the Python source file in + the Git repo. + Example: + + With the following GitHub repo directory structure: + + >>> |----- README.md + >>> |----- src + >>> |----- inference.py + >>> |----- test.py + + You can assign entry_point='src/inference.py'. source_dir (str): Path (absolute or relative) to a directory with any other training source code dependencies aside from the entry point file (default: None). Structure within this - directory will be preserved when training on SageMaker. - If the directory points to S3, no code will be uploaded and the S3 location will be used instead. + directory will be preserved when training on SageMaker. If 'git_config' is provided, + 'source_dir' should be a relative location to a directory in the Git repo. If the directory points + to S3, no code will be uploaded and the S3 location will be used instead. + Example: + + With the following GitHub repo directory structure: + + >>> |----- README.md + >>> |----- src + >>> |----- inference.py + >>> |----- test.py + + You can assign entry_point='inference.py', source_dir='src'. dependencies (list[str]): A list of paths to directories (absolute or relative) with any additional libraries that will be exported to the container (default: []). The library folders will be copied to SageMaker in the same folder where the entrypoint is copied. - If the ```source_dir``` points to S3, code will be uploaded and the S3 location will be used - instead. Example: + If 'git_config' is provided, 'dependencies' should be a list of relative locations to directories + with any additional libraries needed in the Git repo. If the ```source_dir``` points to S3, code + will be uploaded and the S3 location will be used instead. + Example: The following call - >>> Estimator(entry_point='train.py', dependencies=['my/libs/common', 'virtual-env']) + >>> Estimator(entry_point='inference.py', dependencies=['my/libs/common', 'virtual-env']) results in the following inside the container: >>> $ ls >>> opt/ml/code - >>> |------ train.py + >>> |------ inference.py >>> |------ common >>> |------ virtual-env @@ -417,6 +443,7 @@ def __init__(self, model_data, image, role, entry_point, source_dir=None, predic self.entry_point = entry_point self.source_dir = source_dir self.dependencies = dependencies or [] + self.git_config = git_config self.enable_cloudwatch_metrics = enable_cloudwatch_metrics self.container_log_level = container_log_level if code_location: @@ -440,6 +467,8 @@ def prepare_container_def(self, instance_type, accelerator_type=None): # pylint dict[str, str]: A container definition object usable with the CreateModel API. """ deploy_key_prefix = fw_utils.model_code_key_prefix(self.key_prefix, self.name, self.image) + if self.git_config: + git_clone_repo_and_enter(self.git_config, self.entry_point, self.source_dir, self.dependencies) self._upload_code(deploy_key_prefix) deploy_env = dict(self.env) deploy_env.update(self._framework_env_vars()) diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py new file mode 100644 index 0000000000..e69de29bb2 From 52e5625095958a6d6db59971843aab78f256ba9f Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Wed, 12 Jun 2019 16:00:05 -0700 Subject: [PATCH 06/33] create git_utils, write test for dependencies --- tests/integ/test_git.py | 65 +++++++++++++++++++++++++++++++++++- tests/unit/test_estimator.py | 11 +++--- tests/unit/test_git_utils.py | 32 ++++++++++++++++++ 3 files changed, 103 insertions(+), 5 deletions(-) diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index f0fe79e21c..ea4f6c6fbe 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -16,6 +16,8 @@ import numpy +from sagemaker.mxnet.estimator import MXNet +from sagemaker.mxnet.model import MXNetModel from sagemaker.pytorch.estimator import PyTorch from sagemaker.pytorch.model import PyTorchModel from sagemaker.utils import sagemaker_timestamp @@ -23,7 +25,7 @@ from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' BRANCH = 'branch1' -COMMIT = '4893e528afa4a790331e1b5286954f073b0f14a2' +COMMIT = 'b61c450200d6a309c8d24ac14b8adddc405acc56' def test_git_support_with_pytorch(sagemaker_local_session): @@ -55,3 +57,64 @@ def test_git_support_with_pytorch(sagemaker_local_session): data = numpy.zeros(shape=(1, 1, 28, 28)) result = predictor.predict(data) assert result is not None + + +def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version): + with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): + script_path = 'mnist.py' + data_path = os.path.join(DATA_DIR, 'mxnet_mnist') + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + dependencies = ['foo/bar.py'] + mx = MXNet(entry_point=script_path, role='SageMakerRole', + source_dir='mxnet', dependencies=dependencies, + framework_version=MXNet.LATEST_VERSION, py_version=PYTHON_VERSION, + train_instance_count=1, train_instance_type='ml.c4.xlarge', + sagemaker_session=sagemaker_local_session, git_config=git_config) + + train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'), + key_prefix='integ-test-data/mxnet_mnist/train') + test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'), + key_prefix='integ-test-data/mxnet_mnist/test') + + mx.fit({'train': train_input, 'test': test_input}) + + files = [file for file in os.listdir(mx.source_dir)] + assert files == ['some_file', 'mnist.py'] + + endpoint_name = 'test-git_support-with-mxnet-{}'.format(sagemaker_timestamp()) + + with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_local_session): + desc = sagemaker_local_session.sagemaker_client.describe_training_job(mx.latest_training_job.name) + model_data = desc['ModelArtifacts']['S3ModelArtifacts'] + model = MXNetModel(model_data, 'SageMakerRole', entry_point=script_path, + py_version=PYTHON_VERSION, sagemaker_session=sagemaker_local_session, + framework_version=mxnet_full_version) + predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) + + data = numpy.zeros(shape=(1, 1, 28, 28)) + result = predictor.predict(data) + assert result is not None + + +# def test_git_support_for_source_dirs_and_dependencies(sagemaker_local_session): +# source_dir = 'pytorch_source_dirs' +# lib = 'alexa.py' +# git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} +# +# with open(lib, 'w') as f: +# f.write('def question(to_anything): return 42') +# +# estimator = PyTorch(entry_point='train.py', role='SageMakerRole', source_dir=source_dir, +# dependencies=[lib], git_config=git_config, +# py_version=PYTHON_VERSION, train_instance_count=1, +# train_instance_type='local', +# sagemaker_session=sagemaker_local_session) +# estimator.fit() +# +# with local_mode_utils.lock(): +# try: +# predictor = estimator.deploy(initial_instance_count=1, instance_type='local') +# predict_response = predictor.predict([7]) +# assert predict_response == [49] +# finally: +# estimator.delete_endpoint() diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 85728b66ed..e57ff4243e 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -50,7 +50,7 @@ OUTPUT_PATH = 's3://bucket/prefix' GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' BRANCH = 'branch1' -COMMIT = '4893e528afa4a790331e1b5286954f073b0f14a2' +COMMIT = 'b61c450200d6a309c8d24ac14b8adddc405acc56' DESCRIBE_TRAINING_JOB_RESULT = { 'ModelArtifacts': { @@ -607,7 +607,7 @@ def test_git_support_with_branch_and_commit_succeed(sagemaker_session): train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() - assert os.path.isfile(fw.entry_point) + assert os.path.isfile(os.path.join(fw.source_dir, fw.entry_point)) assert os.path.isdir(fw.source_dir) @@ -618,18 +618,21 @@ def test_git_support_with_branch_succeed(sagemaker_session): train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() - assert os.path.isfile(fw.entry_point) + assert os.path.isfile(os.path.join(fw.source_dir, fw.entry_point)) assert os.path.isdir(fw.source_dir) def test_git_support_with_dependencies_succeed(sagemaker_session): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + dependencies = ['foo', 'foo/bar'] fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, - dependencies=['foo', 'foo/bar'], role=ROLE, sagemaker_session=sagemaker_session, + dependencies=dependencies, role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() assert os.path.isfile(fw.entry_point) + for directory in dependencies: + assert os.path.exists(directory) def test_git_support_without_branch_and_commit_succeed(sagemaker_session): diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index e69de29bb2..2de0b41627 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -0,0 +1,32 @@ +# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os + +from sagemaker.git_utils import git_clone_repo_and_enter + +GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' +BRANCH = 'branch1' +COMMIT = '4893e528afa4a790331e1b5286954f073b0f14a2' + + +def test_git_clone_repo_and_enter(): + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point' + source_dir = 'source_dir' + dependencies = ['foo', 'foo/bar'] + git_clone_repo_and_enter(git_config, entry_point, source_dir, dependencies) + assert os.path.isfile(os.path.join(source_dir, entry_point)) + for directory in dependencies: + assert os.path.exists(directory) From f3978505153f615fc07f49f36bb179c1976d8361 Mon Sep 17 00:00:00 2001 From: GaryTu1020 <45720913+GaryTu1020@users.noreply.github.com> Date: Wed, 12 Jun 2019 16:13:29 -0700 Subject: [PATCH 07/33] Update doc/overview.rst Co-Authored-By: Marcio Vinicius dos Santos --- doc/overview.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/overview.rst b/doc/overview.rst index adf7daa612..74e53338f9 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -86,7 +86,7 @@ For more `information Date: Wed, 12 Jun 2019 16:14:37 -0700 Subject: [PATCH 08/33] Update doc/overview.rst Co-Authored-By: Marcio Vinicius dos Santos --- doc/overview.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/overview.rst b/doc/overview.rst index 74e53338f9..ca9312d2cd 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -87,7 +87,7 @@ For more `information Date: Thu, 13 Jun 2019 15:08:11 -0700 Subject: [PATCH 09/33] add more integ tests --- doc/overview.rst | 68 ++++++++++++++++++------------- src/sagemaker/estimator.py | 68 +++++++++++++------------------ tests/integ/test_git.py | 79 ++++++++++++++++++++++++------------ tests/unit/test_estimator.py | 18 ++++---- 4 files changed, 130 insertions(+), 103 deletions(-) diff --git a/doc/overview.rst b/doc/overview.rst index adf7daa612..6ae3c3f16c 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -86,53 +86,63 @@ For more `information >> |----- train.py >>> |----- test.py - You can assign entry_point='train.py', source_dir='src'. + and you need 'train.py' as entry point and 'test.py' as training source code as well, you can + assign entry_point='train.py', source_dir='src'. dependencies (list[str]): A list of paths to directories (absolute or relative) with any additional libraries that will be exported to the container (default: []). The library folders will be copied to SageMaker in the same folder where the entrypoint is copied. @@ -876,8 +877,8 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, self._hyperparameters = hyperparameters or {} def _git_clone_code(self): - """Git clone repo containing the training scripts. This method also validate ``git_config``, - and set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. + """Git clone repo containing the training scripts and enter that directory. This method also validate + ``git_config``, and set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. Raises: CalledProcessError: If 1. failed to clone git repo @@ -889,29 +890,39 @@ def _git_clone_code(self): self._validate_git_config() # create a temporary directory to store the cloned repo repo_dir = tempfile.mkdtemp() - try: - subprocess.check_call(['git', 'clone', self.git_config['repo'], repo_dir]) - except subprocess.CalledProcessError: - raise subprocess.CalledProcessError(1, cmd='git clone {} {}'.format(self.git_config['repo'], repo_dir)) + # try: + subprocess.check_call(['git', 'clone', self.git_config['repo'], repo_dir]) + # except subprocess.CalledProcessError: + # raise subprocess.CalledProcessError(1, cmd='git clone {} {}'.format(self.git_config['repo'], repo_dir)) self._checkout_branch_and_commit(repo_dir) - # check if the cloned repo contains entry point and source dir; if so, set ``entry_point`` and - # ``source_dir`` to the paths to local file system. + # check if the cloned repo contains entry point, source directory and dependencies if self.source_dir: - if os.path.isdir(os.path.join(repo_dir, self.source_dir)): - self.source_dir = os.path.join(repo_dir, self.source_dir) - os.chdir(self.source_dir) - else: + if not os.path.isdir(os.path.join(repo_dir, self.source_dir)): raise ValueError('Source directory does not exist in the repo.') - if not os.path.isfile(os.path.join(self.source_dir, self.entry_point)): + if not os.path.isfile(os.path.join(repo_dir, self.source_dir, self.entry_point)): raise ValueError('Entry point does not exist in the repo.') + self.source_dir = os.path.join(repo_dir, self.source_dir) else: if not os.path.isfile(os.path.join(repo_dir, self.entry_point)): raise ValueError('Entry point does not exist in the repo.') + self.entry_point = os.path.join(repo_dir, self.entry_point) + dependencies = [] for path in self.dependencies: - if not os.path.isdir(os.path.join(repo_dir, path)): + if not os.path.exists(os.path.join(repo_dir, path)): raise ValueError('Dependency {} does not exist in the repo.'.format(path)) + dependencies.append(os.path.join(repo_dir, path)) + self.dependencies = dependencies + + def _validate_git_config(self): + """check if a git_config param is valid + + Raises: + ValueError: If 'git_config' has no key 'repo' + """ + if 'repo' not in self.git_config: + raise ValueError('Please provide a repo for git_config.') def _checkout_branch_and_commit(self, repo_dir): """Enter the directory where the repo is cloned, and checkout the required branch and commit. @@ -923,33 +934,10 @@ def _checkout_branch_and_commit(self, repo_dir): ValueError: If 1. entry point specified does not exist in the repo 2. source dir specified does not exist in the repo """ - os.chdir(repo_dir) if 'branch' in self.git_config: - try: - subprocess.check_call(['git', 'checkout', self.git_config['branch']]) - except subprocess.CalledProcessError: - raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(self.git_config['branch'])) + subprocess.check_call(['git', 'checkout', self.git_config['branch']], cwd=str(repo_dir)) if 'commit' in self.git_config: - try: - subprocess.check_call(['git', 'checkout', self.git_config['commit']]) - except subprocess.CalledProcessError: - raise subprocess.CalledProcessError(1, cmd='git checkout {}'.format(self.git_config['commit'])) - - def _validate_git_config(self): - """check if a git_config param is valid - - Raises: - ValueError: If: - 1. git_config has no key 'repo' - 2. git_config['repo'] is in the wrong format. - """ - if 'repo' not in self.git_config: - raise ValueError('Please provide a repo for git_config.') - repo = self.git_config['repo'] - codecommit_url = repo.startswith('https://git-codecommit') or repo.startswith('ssh://git-codecommit') - github_url = repo.startswith('https://github') or repo.startswith('git@github') - if not codecommit_url and not github_url: - raise ValueError('Please provide a valid git repo url.') + subprocess.check_call(['git', 'checkout', self.git_config['commit']], cwd=str(repo_dir)) def _prepare_for_training(self, job_name=None): """Set hyperparameters needed for training. This method will also validate ``source_dir``. diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index f0fe79e21c..4e27b1418d 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -16,42 +16,69 @@ import numpy +from sagemaker.mxnet.estimator import MXNet +from sagemaker.mxnet.model import MXNetModel from sagemaker.pytorch.estimator import PyTorch from sagemaker.pytorch.model import PyTorchModel from sagemaker.utils import sagemaker_timestamp -from tests.integ import DATA_DIR, PYTHON_VERSION, TRAINING_DEFAULT_TIMEOUT_MINUTES -from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name +from tests.integ import DATA_DIR, PYTHON_VERSION + GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' BRANCH = 'branch1' -COMMIT = '4893e528afa4a790331e1b5286954f073b0f14a2' +COMMIT = 'b61c450200d6a309c8d24ac14b8adddc405acc56' def test_git_support_with_pytorch(sagemaker_local_session): - with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): - script_path = 'mnist.py' - data_path = os.path.join(DATA_DIR, 'pytorch_mnist') - git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} - pytorch = PyTorch(entry_point=script_path, role='SageMakerRole', source_dir='pytorch', - framework_version=PyTorch.LATEST_VERSION, py_version=PYTHON_VERSION, - train_instance_count=1, train_instance_type='ml.c4.xlarge', - sagemaker_session=sagemaker_local_session, git_config=git_config) - - train_input = pytorch.sagemaker_session.upload_data(path=os.path.join(data_path, 'training'), + script_path = 'mnist.py' + data_path = os.path.join(DATA_DIR, 'pytorch_mnist') + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + pytorch = PyTorch(entry_point=script_path, role='SageMakerRole', source_dir='pytorch', + framework_version=PyTorch.LATEST_VERSION, py_version=PYTHON_VERSION, + train_instance_count=1, train_instance_type='local', + sagemaker_session=sagemaker_local_session, git_config=git_config) + + train_input = pytorch.sagemaker_session.upload_data(path=os.path.join(data_path, 'training'), key_prefix='integ-test-data/pytorch_mnist/training') - pytorch.fit({'training': train_input}) + pytorch.fit({'training': train_input}) + + predictor = pytorch.deploy(initial_instance_count=1, instance_type='local') + + data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32) + result = predictor.predict(data) + assert result is not None + + +def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version): + script_path = 'mnist.py' + data_path = os.path.join(DATA_DIR, 'mxnet_mnist') + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + dependencies = ['foo/bar.py'] + mx = MXNet(entry_point=script_path, role='SageMakerRole', + source_dir='mxnet', dependencies=dependencies, + framework_version=MXNet.LATEST_VERSION, py_version=PYTHON_VERSION, + train_instance_count=1, train_instance_type='local', + sagemaker_session=sagemaker_local_session, git_config=git_config) + + train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'), + key_prefix='integ-test-data/mxnet_mnist/train') + test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'), + key_prefix='integ-test-data/mxnet_mnist/test') + + mx.fit({'train': train_input, 'test': test_input}) - files = [file for file in os.listdir(pytorch.source_dir)] - assert files == ['some-file', 'mnist.py'] + files = [file for file in os.listdir(mx.source_dir)] + assert files == ['some_file', 'mnist.py'] - endpoint_name = 'test-git_support-with-pytorch-{}'.format(sagemaker_timestamp()) + endpoint_name = 'test-git_support-with-mxnet-{}'.format(sagemaker_timestamp()) - with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_local_session): - desc = sagemaker_local_session.sagemaker_client.describe_training_job(pytorch.latest_training_job.name) - model_data = desc['ModelArtifacts']['S3ModelArtifacts'] - model = PyTorchModel(model_data, 'SageMakerRole', entry_point=script_path, - sagemaker_session=sagemaker_local_session) - predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) + script_abs_path = os.path.join(mx.source_dir, script_path) + desc = sagemaker_local_session.sagemaker_client.describe_training_job(mx.latest_training_job.name) + model_data = desc['ModelArtifacts']['S3ModelArtifacts'] + model = MXNetModel(model_data, 'SageMakerRole', entry_point=script_abs_path, + py_version=PYTHON_VERSION, sagemaker_session=sagemaker_local_session, + framework_version=mxnet_full_version) + predictor = model.deploy(1, 'local', endpoint_name=endpoint_name) - data = numpy.zeros(shape=(1, 1, 28, 28)) - result = predictor.predict(data) - assert result is not None + data = numpy.zeros(shape=(1, 1, 28, 28)) + result = predictor.predict(data) + assert result is not None diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 85728b66ed..9458806a21 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -50,7 +50,7 @@ OUTPUT_PATH = 's3://bucket/prefix' GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' BRANCH = 'branch1' -COMMIT = '4893e528afa4a790331e1b5286954f073b0f14a2' +COMMIT = 'b61c450200d6a309c8d24ac14b8adddc405acc56' DESCRIBE_TRAINING_JOB_RESULT = { 'ModelArtifacts': { @@ -607,7 +607,7 @@ def test_git_support_with_branch_and_commit_succeed(sagemaker_session): train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() - assert os.path.isfile(fw.entry_point) + assert os.path.isfile(os.path.join(fw.source_dir, fw.entry_point)) assert os.path.isdir(fw.source_dir) @@ -618,7 +618,7 @@ def test_git_support_with_branch_succeed(sagemaker_session): train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() - assert os.path.isfile(fw.entry_point) + assert os.path.isfile(os.path.join(fw.source_dir, fw.entry_point)) assert os.path.isdir(fw.source_dir) @@ -630,6 +630,8 @@ def test_git_support_with_dependencies_succeed(sagemaker_session): enable_cloudwatch_metrics=True) fw.fit() assert os.path.isfile(fw.entry_point) + for item in fw.dependencies: + assert os.path.exists(item) def test_git_support_without_branch_and_commit_succeed(sagemaker_session): @@ -659,9 +661,9 @@ def test_git_support_bad_repo_url_format(sagemaker_session): source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) - with pytest.raises(ValueError) as error: + with pytest.raises(subprocess.CalledProcessError) as error: fw.fit() - assert 'Please provide a valid git repo url.' in str(error) + assert 'returned non-zero exit status' in str(error) def test_git_support_git_clone_fail(sagemaker_session): @@ -671,7 +673,7 @@ def test_git_support_git_clone_fail(sagemaker_session): train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) with pytest.raises(subprocess.CalledProcessError) as error: fw.fit() - assert "Command 'git clone" in str(error) + assert 'returned non-zero exit status' in str(error) def test_git_support_branch_not_exist(sagemaker_session): @@ -684,7 +686,7 @@ def test_git_support_branch_not_exist(sagemaker_session): enable_cloudwatch_metrics=True) with pytest.raises(subprocess.CalledProcessError) as error: fw.fit() - assert "Command 'git checkout" in str(error) + assert 'returned non-zero exit status' in str(error) def test_git_support_commit_not_exist(sagemaker_session): @@ -697,7 +699,7 @@ def test_git_support_commit_not_exist(sagemaker_session): enable_cloudwatch_metrics=True) with pytest.raises(subprocess.CalledProcessError) as error: fw.fit() - assert "Command 'git checkout" in str(error) + assert 'returned non-zero exit status' in str(error) def test_git_support_entry_point_not_exist(sagemaker_session): From 241ac92206c487bf47d3050b5aed20a4d959c271 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Fri, 14 Jun 2019 17:27:40 -0700 Subject: [PATCH 10/33] write unit tests for git_utils --- src/sagemaker/estimator.py | 72 ++------------ src/sagemaker/git_utils.py | 104 ++++++++++++++++++++ tests/integ/test_git.py | 5 +- tests/unit/test_estimator.py | 70 ++++++++------ tests/unit/test_git_utils.py | 177 +++++++++++++++++++++++++++++++++++ 5 files changed, 331 insertions(+), 97 deletions(-) create mode 100644 src/sagemaker/git_utils.py create mode 100644 tests/unit/test_git_utils.py diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index d734378f70..f2e40e91a4 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -15,8 +15,6 @@ import json import logging import os -import subprocess -import tempfile import warnings from abc import ABCMeta from abc import abstractmethod @@ -24,6 +22,7 @@ from six import string_types import sagemaker +import sagemaker.git_utils from sagemaker.analytics import TrainingJobAnalytics from sagemaker.fw_utils import (create_image_uri, tar_and_upload_dir, parse_s3_url, UploadedCode, validate_source_dir) @@ -876,69 +875,6 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, self._hyperparameters = hyperparameters or {} - def _git_clone_code(self): - """Git clone repo containing the training scripts and enter that directory. This method also validate - ``git_config``, and set ``entry_point`` and ``source_dir`` to the right file or directory in the repo cloned. - - Raises: - CalledProcessError: If 1. failed to clone git repo - 2. failed to checkout the required branch - 3. failed to checkout the required commit - ValueError: If 1. entry point specified does not exist in the repo - 2. source dir specified does not exist in the repo - """ - self._validate_git_config() - # create a temporary directory to store the cloned repo - repo_dir = tempfile.mkdtemp() - # try: - subprocess.check_call(['git', 'clone', self.git_config['repo'], repo_dir]) - # except subprocess.CalledProcessError: - # raise subprocess.CalledProcessError(1, cmd='git clone {} {}'.format(self.git_config['repo'], repo_dir)) - - self._checkout_branch_and_commit(repo_dir) - - # check if the cloned repo contains entry point, source directory and dependencies - if self.source_dir: - if not os.path.isdir(os.path.join(repo_dir, self.source_dir)): - raise ValueError('Source directory does not exist in the repo.') - if not os.path.isfile(os.path.join(repo_dir, self.source_dir, self.entry_point)): - raise ValueError('Entry point does not exist in the repo.') - self.source_dir = os.path.join(repo_dir, self.source_dir) - else: - if not os.path.isfile(os.path.join(repo_dir, self.entry_point)): - raise ValueError('Entry point does not exist in the repo.') - self.entry_point = os.path.join(repo_dir, self.entry_point) - dependencies = [] - for path in self.dependencies: - if not os.path.exists(os.path.join(repo_dir, path)): - raise ValueError('Dependency {} does not exist in the repo.'.format(path)) - dependencies.append(os.path.join(repo_dir, path)) - self.dependencies = dependencies - - def _validate_git_config(self): - """check if a git_config param is valid - - Raises: - ValueError: If 'git_config' has no key 'repo' - """ - if 'repo' not in self.git_config: - raise ValueError('Please provide a repo for git_config.') - - def _checkout_branch_and_commit(self, repo_dir): - """Enter the directory where the repo is cloned, and checkout the required branch and commit. - - Args: - repo_dir: the directory where the repo is cloned - - Raises: - ValueError: If 1. entry point specified does not exist in the repo - 2. source dir specified does not exist in the repo - """ - if 'branch' in self.git_config: - subprocess.check_call(['git', 'checkout', self.git_config['branch']], cwd=str(repo_dir)) - if 'commit' in self.git_config: - subprocess.check_call(['git', 'checkout', self.git_config['commit']], cwd=str(repo_dir)) - def _prepare_for_training(self, job_name=None): """Set hyperparameters needed for training. This method will also validate ``source_dir``. @@ -949,7 +885,11 @@ def _prepare_for_training(self, job_name=None): super(Framework, self)._prepare_for_training(job_name=job_name) if self.git_config: - self._git_clone_code() + updates = sagemaker.git_utils.git_clone_repo(self.git_config, self.entry_point, + self.source_dir, self.dependencies) + self.entry_point = updates['entry_point'] + self.source_dir = updates['source_dir'] + self.dependencies = updates['dependencies'] # validate source dir will raise a ValueError if there is something wrong with the # source directory. We are intentionally not handling it because this is a critical error. diff --git a/src/sagemaker/git_utils.py b/src/sagemaker/git_utils.py new file mode 100644 index 0000000000..7051300db3 --- /dev/null +++ b/src/sagemaker/git_utils.py @@ -0,0 +1,104 @@ +# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import os +import subprocess +import tempfile + + +def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None): + """Git clone repo containing the training code and serving code. This method also validate ``git_config``, + and set ``entry_point``, ``source_dir`` and ``dependencies`` to the right file or directory in the repo cloned. + + Args: + git_config (dict[str, str]): Git configurations used for cloning files, including ``repo``, ``branch`` + and ``commit``. ``branch`` and ``commit`` are optional. If ``branch`` is not specified, master branch + will be used. If ``commit`` is not specified, the latest commit in the required branch will be used. + entry_point (str): A relative location to the Python source file which should be executed as the entry point + to training or model hosting in the Git repo. + source_dir (str): A relative location to a directory with other training or model hosting source code + dependencies aside from the entry point file in the Git repo (default: None). Structure within this + directory are preserved when training on Amazon SageMaker. + dependencies (list[str]): A list of relative locations to directories with any additional libraries that will + be exported to the container in the Git repo (default: []). + + Raises: + CalledProcessError: If 1. failed to clone git repo + 2. failed to checkout the required branch + 3. failed to checkout the required commit + ValueError: If 1. entry point specified does not exist in the repo + 2. source dir specified does not exist in the repo + + Returns: + dict: A dict that contains the updated values of entry_point, source_dir and dependencies + """ + _validate_git_config(git_config) + repo_dir = tempfile.mkdtemp() + subprocess.check_call(['git', 'clone', git_config['repo'], repo_dir]) + + _checkout_branch_and_commit(git_config, repo_dir) + + ret = {'entry_point': entry_point, 'source_dir': source_dir, 'dependencies': dependencies} + # check if the cloned repo contains entry point, source directory and dependencies + if source_dir: + if not os.path.isdir(os.path.join(repo_dir, source_dir)): + raise ValueError('Source directory does not exist in the repo.') + if not os.path.isfile(os.path.join(repo_dir, source_dir, entry_point)): + raise ValueError('Entry point does not exist in the repo.') + ret['source_dir'] = os.path.join(repo_dir, source_dir) + else: + if not os.path.isfile(os.path.join(repo_dir, entry_point)): + raise ValueError('Entry point does not exist in the repo.') + ret['entry_point'] = os.path.join(repo_dir, entry_point) + + ret['dependencies'] = [] + for path in dependencies: + if not os.path.exists(os.path.join(repo_dir, path)): + raise ValueError('Dependency {} does not exist in the repo.'.format(path)) + ret['dependencies'].append(os.path.join(repo_dir, path)) + return ret + + +def _validate_git_config(git_config): + """check if a git_config param is valid + + Args: + git_config ((dict[str, str]): Git configurations used for cloning files, including ``repo``, ``branch`` + and ``commit``. + + Raises: + ValueError: If: + 1. git_config has no key 'repo' + 2. git_config['repo'] is in the wrong format. + """ + if 'repo' not in git_config: + raise ValueError('Please provide a repo for git_config.') + + +def _checkout_branch_and_commit(git_config, repo_dir): + """Checkout the required branch and commit. + + Args: + git_config: (dict[str, str]): Git configurations used for cloning files, including ``repo``, ``branch`` + and ``commit``. + repo_dir (str): the directory where the repo is cloned + + Raises: + ValueError: If 1. entry point specified does not exist in the repo + 2. source dir specified does not exist in the repo + """ + if 'branch' in git_config: + subprocess.check_call(args=['git', 'checkout', git_config['branch']], cwd=str(repo_dir)) + if 'commit' in git_config: + subprocess.check_call(args=['git', 'checkout', git_config['commit']], cwd=str(repo_dir)) diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index 4e27b1418d..e3c3f17c47 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -19,7 +19,6 @@ from sagemaker.mxnet.estimator import MXNet from sagemaker.mxnet.model import MXNetModel from sagemaker.pytorch.estimator import PyTorch -from sagemaker.pytorch.model import PyTorchModel from sagemaker.utils import sagemaker_timestamp from tests.integ import DATA_DIR, PYTHON_VERSION @@ -38,7 +37,7 @@ def test_git_support_with_pytorch(sagemaker_local_session): sagemaker_session=sagemaker_local_session, git_config=git_config) train_input = pytorch.sagemaker_session.upload_data(path=os.path.join(data_path, 'training'), - key_prefix='integ-test-data/pytorch_mnist/training') + key_prefix='integ-test-data/pytorch_mnist/training') pytorch.fit({'training': train_input}) predictor = pytorch.deploy(initial_instance_count=1, instance_type='local') @@ -62,7 +61,7 @@ def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version): train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'), key_prefix='integ-test-data/mxnet_mnist/train') test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'), - key_prefix='integ-test-data/mxnet_mnist/test') + key_prefix='integ-test-data/mxnet_mnist/test') mx.fit({'train': train_input, 'test': test_input}) diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 9458806a21..f5c5a9d88b 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -600,48 +600,54 @@ def test_prepare_for_training_force_name_generation(strftime, sagemaker_session) assert JOB_NAME == fw._current_job_name -def test_git_support_with_branch_and_commit_succeed(sagemaker_session): - git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} - fw = DummyFramework(entry_point='entry_point', git_config=git_config, - source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, - enable_cloudwatch_metrics=True) +@patch('sagemaker.git_utils.git_clone_repo') +def test_git_support_with_branch_and_commit_succeed(git_clone_repo, sagemaker_session): + git_clone_repo.side_effect = lambda git_config, entry_point, source_dir=None, dependencies=None: { + 'entry_point': '/tmp/repo_dir/entry_point', 'source_dir': None, 'dependencies': None} + git_conf = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + + fw = DummyFramework(entry_point='entry_point', git_config=git_conf, role=ROLE, + sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() - assert os.path.isfile(os.path.join(fw.source_dir, fw.entry_point)) - assert os.path.isdir(fw.source_dir) -def test_git_support_with_branch_succeed(sagemaker_session): - git_config = {'repo': GIT_REPO, 'branch': BRANCH} - fw = DummyFramework(entry_point='entry_point', git_config=git_config, - source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, +@patch('sagemaker.git_utils.git_clone_repo') +def test_git_support_with_branch_succeed(git_clone_repo, sagemaker_session): + git_clone_repo.side_effect = lambda git_config, entry_point, source_dir, dependencies=None: { + 'entry_point': '/tmp/repo_dir/source_dir/entry_point', 'source_dir': None, 'dependencies': None} + git_conf = {'repo': GIT_REPO, 'branch': BRANCH} + fw = DummyFramework(entry_point='entry_point', git_config=git_conf, + role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() - assert os.path.isfile(os.path.join(fw.source_dir, fw.entry_point)) - assert os.path.isdir(fw.source_dir) -def test_git_support_with_dependencies_succeed(sagemaker_session): - git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} - fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, +@patch('sagemaker.git_utils.git_clone_repo') +def test_git_support_with_dependencies_succeed(git_clone_repo, sagemaker_session): + git_clone_repo.side_effect = lambda git_config, entry_point, source_dir, dependencies: { + 'entry_point': '/tmp/repo_dir/source_dir/entry_point', + 'source_dir': None, + 'dependencies': ['/tmp/repo_dir/foo', '/tmp/repo_dir/foo/bar']} + git_conf = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_conf, dependencies=['foo', 'foo/bar'], role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() - assert os.path.isfile(fw.entry_point) - for item in fw.dependencies: - assert os.path.exists(item) -def test_git_support_without_branch_and_commit_succeed(sagemaker_session): - git_config = {'repo': GIT_REPO} - fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_config, +@patch('sagemaker.git_utils.git_clone_repo') +def test_git_support_without_branch_and_commit_succeed(git_clone_repo, sagemaker_session): + git_clone_repo.side_effect = lambda git_config, entry_point, source_dir, dependencies=None: { + 'entry_point': '/tmp/repo_dir/source_dir/entry_point', 'source_dir': None, 'dependencies': None} + git_conf = {'repo': GIT_REPO} + fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_conf, role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() - assert os.path.isfile(fw.entry_point) def test_git_support_repo_not_provided(sagemaker_session): @@ -676,12 +682,14 @@ def test_git_support_git_clone_fail(sagemaker_session): assert 'returned non-zero exit status' in str(error) +@patch('sagemaker.git_utils.git_clone_repo', + side_effect=subprocess.CalledProcessError(returncode=1, cmd='git checkout branch-that-does-not-exist')) def test_git_support_branch_not_exist(sagemaker_session): git_config = {'repo': GIT_REPO, 'branch': 'branch-that-does-not-exist', 'commit': COMMIT} fw = DummyFramework(entry_point='entry_point', git_config=git_config, - source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, + role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) with pytest.raises(subprocess.CalledProcessError) as error: @@ -689,12 +697,14 @@ def test_git_support_branch_not_exist(sagemaker_session): assert 'returned non-zero exit status' in str(error) +@patch('sagemaker.git_utils.git_clone_repo', + side_effect=subprocess.CalledProcessError(returncode=1, cmd='commit-sha-that-does-not-exist')) def test_git_support_commit_not_exist(sagemaker_session): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': 'commit-sha-that-does-not-exist'} fw = DummyFramework(entry_point='entry_point', git_config=git_config, - source_dir='source_dir', role=ROLE, sagemaker_session=sagemaker_session, + role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) with pytest.raises(subprocess.CalledProcessError) as error: @@ -702,6 +712,8 @@ def test_git_support_commit_not_exist(sagemaker_session): assert 'returned non-zero exit status' in str(error) +@patch('sagemaker.git_utils.git_clone_repo', + side_effect=ValueError('Entry point does not exist in the repo.')) def test_git_support_entry_point_not_exist(sagemaker_session): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} fw = DummyFramework(entry_point='entry_point_that_does_not_exist', git_config=git_config, @@ -713,6 +725,8 @@ def test_git_support_entry_point_not_exist(sagemaker_session): assert 'Entry point does not exist in the repo.' in str(error) +@patch('sagemaker.git_utils.git_clone_repo', + side_effect=ValueError('Source directory does not exist in the repo.')) def test_git_support_source_dir_not_exist(sagemaker_session): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} fw = DummyFramework(entry_point='entry_point', git_config=git_config, @@ -724,6 +738,8 @@ def test_git_support_source_dir_not_exist(sagemaker_session): assert 'Source directory does not exist in the repo.' in str(error) +@patch('sagemaker.git_utils.git_clone_repo', + side_effect=ValueError('Dependency no-such-dir does not exist in the repo.')) def test_git_support_dependencies_not_exist(sagemaker_session): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} fw = DummyFramework(entry_point='entry_point', git_config=git_config, @@ -1414,5 +1430,3 @@ def test_encryption_flag_in_non_vpc_mode_invalid(sagemaker_session): estimator.fit() assert '"EnableInterContainerTrafficEncryption" and "VpcConfig" must be provided together' in str(error) - -################################################################################# diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py new file mode 100644 index 0000000000..dce38175e0 --- /dev/null +++ b/tests/unit/test_git_utils.py @@ -0,0 +1,177 @@ +# Copyright 2017-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import pytest +import subprocess +from mock import patch + +from sagemaker import git_utils + +REPO_DIR = '/tmp/repo_dir' +GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' +BRANCH = 'branch1' +COMMIT = 'b61c450200d6a309c8d24ac14b8adddc405acc56' + + +@patch('subprocess.check_call') +@patch('tempfile.mkdtemp', return_value=REPO_DIR) +@patch('sagemaker.git_utils._validate_git_config') +@patch('sagemaker.git_utils._checkout_branch_and_commit') +@patch('os.path.isfile', return_value=True) +@patch('os.path.isdir', return_value=True) +@patch('os.path.exists', return_value=True) +def test_git_clone_repo_succeed(exists, isdir, isfile, checkout_branch_and_commit, + validate_git_config, mkdtemp, check_call): + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point' + source_dir = 'source_dir' + dependencies = ['foo', 'bar'] + ret = git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) + validate_git_config.assert_called_with(git_config) + check_call.assert_called_with(['git', 'clone', git_config['repo'], REPO_DIR]) + mkdtemp.assert_called_once() + checkout_branch_and_commit.assert_called_with(git_config, REPO_DIR) + assert ret['entry_point'] == 'entry_point' + assert ret['source_dir'] == '/tmp/repo_dir/source_dir' + assert ret['dependencies'] == ['/tmp/repo_dir/foo', '/tmp/repo_dir/bar'] + + +@patch('subprocess.check_call') +@patch('tempfile.mkdtemp', return_value=REPO_DIR) +@patch('sagemaker.git_utils._validate_git_config', + side_effect=ValueError('Please provide a repo for git_config.')) +@patch('sagemaker.git_utils._checkout_branch_and_commit') +@patch('os.path.isfile', return_value=True) +@patch('os.path.isdir', return_value=True) +@patch('os.path.exists', return_value=True) +def test_git_clone_repo_repo_not_provided(exists, isdir, isfile, checkout_branch_and_commit, + validate_git_config, mkdtemp, check_call): + git_config = {'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point_that_does_not_exist' + source_dir = 'source_dir' + dependencies = ['foo', 'bar'] + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) + assert 'Please provide a repo for git_config.' in str(error) + + +@patch('subprocess.check_call', + side_effect=subprocess.CalledProcessError(returncode=1, cmd='git clone {} {}'.format(GIT_REPO, REPO_DIR))) +@patch('tempfile.mkdtemp', return_value=REPO_DIR) +@patch('sagemaker.git_utils._validate_git_config') +@patch('sagemaker.git_utils._checkout_branch_and_commit') +@patch('os.path.isfile', return_value=True) +@patch('os.path.isdir', return_value=True) +@patch('os.path.exists', return_value=True) +def test_git_clone_repo_clone_fail(exists, isdir, isfile, checkout_branch_and_commit, + validate_git_config, mkdtemp, check_call): + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point' + source_dir = 'source_dir' + dependencies = ['foo', 'bar'] + with pytest.raises(subprocess.CalledProcessError) as error: + git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) + assert 'returned non-zero exit status' in str(error) + + +@patch('subprocess.check_call') +@patch('tempfile.mkdtemp', return_value=REPO_DIR) +@patch('sagemaker.git_utils._validate_git_config') +@patch('sagemaker.git_utils._checkout_branch_and_commit', + side_effect=subprocess.CalledProcessError(returncode=1, cmd='git checkout {}'.format(BRANCH))) +@patch('os.path.isfile', return_value=True) +@patch('os.path.isdir', return_value=True) +@patch('os.path.exists', return_value=True) +def test_git_clone_repo_branch_not_exist(exists, isdir, isfile, checkout_branch_and_commit, + validate_git_config, mkdtemp, check_call): + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point' + source_dir = 'source_dir' + dependencies = ['foo', 'bar'] + with pytest.raises(subprocess.CalledProcessError) as error: + git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) + assert 'returned non-zero exit status' in str(error) + + +@patch('subprocess.check_call') +@patch('tempfile.mkdtemp', return_value=REPO_DIR) +@patch('sagemaker.git_utils._validate_git_config') +@patch('sagemaker.git_utils._checkout_branch_and_commit', + side_effect=subprocess.CalledProcessError(returncode=1, cmd='git checkout {}'.format(COMMIT))) +@patch('os.path.isfile', return_value=True) +@patch('os.path.isdir', return_value=True) +@patch('os.path.exists', return_value=True) +def test_git_clone_repo_commit_not_exist(exists, isdir, isfile, checkout_branch_and_commit, + validate_git_config, mkdtemp, check_call): + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point' + source_dir = 'source_dir' + dependencies = ['foo', 'bar'] + with pytest.raises(subprocess.CalledProcessError) as error: + git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) + assert 'returned non-zero exit status' in str(error) + + +@patch('subprocess.check_call') +@patch('tempfile.mkdtemp', return_value=REPO_DIR) +@patch('sagemaker.git_utils._validate_git_config') +@patch('sagemaker.git_utils._checkout_branch_and_commit') +@patch('os.path.isfile', return_value=False) +@patch('os.path.isdir', return_value=True) +@patch('os.path.exists', return_value=True) +def test_git_clone_repo_entry_point_not_exist(exists, isdir, isfile, checkout_branch_and_commit, + validate_git_config, mkdtemp, check_call): + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point_that_does_not_exist' + source_dir = 'source_dir' + dependencies = ['foo', 'bar'] + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) + assert 'Entry point does not exist in the repo.' in str(error) + + +@patch('subprocess.check_call') +@patch('tempfile.mkdtemp', return_value=REPO_DIR) +@patch('sagemaker.git_utils._validate_git_config') +@patch('sagemaker.git_utils._checkout_branch_and_commit') +@patch('os.path.isfile', return_value=True) +@patch('os.path.isdir', return_value=False) +@patch('os.path.exists', return_value=True) +def test_git_clone_repo_source_dir_not_exist(exists, isdir, isfile, checkout_branch_and_commit, + validate_git_config, mkdtemp, check_call): + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point' + source_dir = 'source_dir_that_does_not_exist' + dependencies = ['foo', 'bar'] + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) + assert 'Source directory does not exist in the repo.' in str(error) + + +@patch('subprocess.check_call') +@patch('tempfile.mkdtemp', return_value=REPO_DIR) +@patch('sagemaker.git_utils._validate_git_config') +@patch('sagemaker.git_utils._checkout_branch_and_commit') +@patch('os.path.isfile', return_value=True) +@patch('os.path.isdir', return_value=True) +@patch('os.path.exists', side_effect=[True, False]) +def test_git_clone_repo_dependencies_not_exist(exists, isdir, isfile, checkout_branch_and_commit, + validate_git_config, mkdtemp, check_call): + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point' + source_dir = 'source_dir' + dependencies = ['foo', 'dep_that_does_not_exist'] + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) + assert 'does not exist in the repo.' in str(error) From c39c344494dc5c9347026c8a16af113f0f5fcc75 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Fri, 14 Jun 2019 17:41:25 -0700 Subject: [PATCH 11/33] delete a line --- tests/unit/test_estimator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index f5c5a9d88b..6e9b559392 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -1429,4 +1429,3 @@ def test_encryption_flag_in_non_vpc_mode_invalid(sagemaker_session): encrypt_inter_container_traffic=True) estimator.fit() assert '"EnableInterContainerTrafficEncryption" and "VpcConfig" must be provided together' in str(error) - From a8f8731b778a00a0d2d85c8c4a4ebbfa42f4c9fa Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 17 Jun 2019 10:32:51 -0700 Subject: [PATCH 12/33] delete unnecessary comments --- tests/integ/test_git.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index ea4f6c6fbe..3288c36e4a 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -94,27 +94,3 @@ def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version): data = numpy.zeros(shape=(1, 1, 28, 28)) result = predictor.predict(data) assert result is not None - - -# def test_git_support_for_source_dirs_and_dependencies(sagemaker_local_session): -# source_dir = 'pytorch_source_dirs' -# lib = 'alexa.py' -# git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} -# -# with open(lib, 'w') as f: -# f.write('def question(to_anything): return 42') -# -# estimator = PyTorch(entry_point='train.py', role='SageMakerRole', source_dir=source_dir, -# dependencies=[lib], git_config=git_config, -# py_version=PYTHON_VERSION, train_instance_count=1, -# train_instance_type='local', -# sagemaker_session=sagemaker_local_session) -# estimator.fit() -# -# with local_mode_utils.lock(): -# try: -# predictor = estimator.deploy(initial_instance_count=1, instance_type='local') -# predict_response = predictor.predict([7]) -# assert predict_response == [49] -# finally: -# estimator.delete_endpoint() From 2b1622bf555c877e0e3c11fb19a37829a8b0e206 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 17 Jun 2019 11:42:59 -0700 Subject: [PATCH 13/33] add assertion to some test functions --- src/sagemaker/estimator.py | 6 +++--- tests/unit/test_estimator.py | 33 ++++++++++++++++++++------------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index f2e40e91a4..e0a46fde2b 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -22,7 +22,7 @@ from six import string_types import sagemaker -import sagemaker.git_utils +from sagemaker import git_utils from sagemaker.analytics import TrainingJobAnalytics from sagemaker.fw_utils import (create_image_uri, tar_and_upload_dir, parse_s3_url, UploadedCode, validate_source_dir) @@ -885,8 +885,8 @@ def _prepare_for_training(self, job_name=None): super(Framework, self)._prepare_for_training(job_name=job_name) if self.git_config: - updates = sagemaker.git_utils.git_clone_repo(self.git_config, self.entry_point, - self.source_dir, self.dependencies) + updates = git_utils.git_clone_repo(self.git_config, self.entry_point, + self.source_dir, self.dependencies) self.entry_point = updates['entry_point'] self.source_dir = updates['source_dir'] self.dependencies = updates['dependencies'] diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 6e9b559392..dd921e2cc4 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -602,52 +602,59 @@ def test_prepare_for_training_force_name_generation(strftime, sagemaker_session) @patch('sagemaker.git_utils.git_clone_repo') def test_git_support_with_branch_and_commit_succeed(git_clone_repo, sagemaker_session): - git_clone_repo.side_effect = lambda git_config, entry_point, source_dir=None, dependencies=None: { + git_clone_repo.side_effect = lambda gitconfig, entrypoint, source_dir=None, dependencies=None: { 'entry_point': '/tmp/repo_dir/entry_point', 'source_dir': None, 'dependencies': None} - git_conf = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} - - fw = DummyFramework(entry_point='entry_point', git_config=git_conf, role=ROLE, + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'entry_point' + fw = DummyFramework(entry_point=entry_point, git_config=git_config, role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() + git_clone_repo.assert_called_once_with(git_config, entry_point, None, []) @patch('sagemaker.git_utils.git_clone_repo') def test_git_support_with_branch_succeed(git_clone_repo, sagemaker_session): - git_clone_repo.side_effect = lambda git_config, entry_point, source_dir, dependencies=None: { + git_clone_repo.side_effect = lambda gitconfig, entrypoint, source_dir, dependencies=None: { 'entry_point': '/tmp/repo_dir/source_dir/entry_point', 'source_dir': None, 'dependencies': None} - git_conf = {'repo': GIT_REPO, 'branch': BRANCH} - fw = DummyFramework(entry_point='entry_point', git_config=git_conf, + git_config = {'repo': GIT_REPO, 'branch': BRANCH} + entry_point = 'entry_point' + fw = DummyFramework(entry_point=entry_point, git_config=git_config, role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() + git_clone_repo.assert_called_once_with(git_config, entry_point, None, []) @patch('sagemaker.git_utils.git_clone_repo') def test_git_support_with_dependencies_succeed(git_clone_repo, sagemaker_session): - git_clone_repo.side_effect = lambda git_config, entry_point, source_dir, dependencies: { + git_clone_repo.side_effect = lambda gitconfig, entrypoint, source_dir, dependencies: { 'entry_point': '/tmp/repo_dir/source_dir/entry_point', 'source_dir': None, 'dependencies': ['/tmp/repo_dir/foo', '/tmp/repo_dir/foo/bar']} - git_conf = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} - fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_conf, + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + entry_point = 'source_dir/entry_point' + fw = DummyFramework(entry_point=entry_point, git_config=git_config, dependencies=['foo', 'foo/bar'], role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() + git_clone_repo.assert_called_once_with(git_config, entry_point, None, ['foo', 'foo/bar']) @patch('sagemaker.git_utils.git_clone_repo') def test_git_support_without_branch_and_commit_succeed(git_clone_repo, sagemaker_session): - git_clone_repo.side_effect = lambda git_config, entry_point, source_dir, dependencies=None: { + git_clone_repo.side_effect = lambda gitconfig, entrypoint, source_dir, dependencies=None: { 'entry_point': '/tmp/repo_dir/source_dir/entry_point', 'source_dir': None, 'dependencies': None} - git_conf = {'repo': GIT_REPO} - fw = DummyFramework(entry_point='source_dir/entry_point', git_config=git_conf, + git_config = {'repo': GIT_REPO} + entry_point = 'source_dir/entry_point' + fw = DummyFramework(entry_point=entry_point, git_config=git_config, role=ROLE, sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True) fw.fit() + git_clone_repo.assert_called_once_with(git_config, entry_point, None, []) def test_git_support_repo_not_provided(sagemaker_session): From 28a5c5888900fd75649aa286e48f726218005c8e Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 17 Jun 2019 13:38:54 -0700 Subject: [PATCH 14/33] remove deploy part in test_git --- tests/integ/test_git.py | 20 +++----------------- tests/unit/test_git_utils.py | 6 +++--- 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index aaed1f4b7e..c6c4d009cd 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -22,9 +22,9 @@ from sagemaker.utils import sagemaker_timestamp from tests.integ import DATA_DIR, PYTHON_VERSION -GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' -BRANCH = 'branch1' -COMMIT = 'b61c450200d6a309c8d24ac14b8adddc405acc56' +GIT_REPO = 'https://github.com/GaryTu1020/sagemaker-python-sdk.git' +BRANCH = 'git_support_testing' +COMMIT = 'b8724a04ee00cb74c12c1b9a0c79d4f065c3801d' def test_git_support_with_pytorch(sagemaker_local_session): @@ -67,17 +67,3 @@ def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version): files = [file for file in os.listdir(mx.source_dir)] assert 'some_file' in files and 'mnist.py' in files - - endpoint_name = 'test-git_support-with-mxnet-{}'.format(sagemaker_timestamp()) - - script_abs_path = os.path.join(mx.source_dir, script_path) - desc = sagemaker_local_session.sagemaker_client.describe_training_job(mx.latest_training_job.name) - model_data = desc['ModelArtifacts']['S3ModelArtifacts'] - model = MXNetModel(model_data, 'SageMakerRole', entry_point=script_abs_path, - py_version=PYTHON_VERSION, sagemaker_session=sagemaker_local_session, - framework_version=mxnet_full_version) - predictor = model.deploy(1, 'local', endpoint_name=endpoint_name) - - data = numpy.zeros(shape=(1, 1, 28, 28)) - result = predictor.predict(data) - assert result is not None diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index dce38175e0..2fb60c76ea 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -19,9 +19,9 @@ from sagemaker import git_utils REPO_DIR = '/tmp/repo_dir' -GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' -BRANCH = 'branch1' -COMMIT = 'b61c450200d6a309c8d24ac14b8adddc405acc56' +GIT_REPO = 'https://github.com/GaryTu1020/sagemaker-python-sdk.git' +BRANCH = 'git_support_testing' +COMMIT = 'b8724a04ee00cb74c12c1b9a0c79d4f065c3801d' @patch('subprocess.check_call') From 0797060a2bbea49184fbddcc00f421afe77f537f Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 17 Jun 2019 14:51:28 -0700 Subject: [PATCH 15/33] change testing git repo --- tests/integ/test_git.py | 8 +++----- tests/unit/test_git_utils.py | 6 +++--- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index c6c4d009cd..645a3c613a 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -17,14 +17,12 @@ import numpy from sagemaker.mxnet.estimator import MXNet -from sagemaker.mxnet.model import MXNetModel from sagemaker.pytorch.estimator import PyTorch -from sagemaker.utils import sagemaker_timestamp from tests.integ import DATA_DIR, PYTHON_VERSION -GIT_REPO = 'https://github.com/GaryTu1020/sagemaker-python-sdk.git' -BRANCH = 'git_support_testing' -COMMIT = 'b8724a04ee00cb74c12c1b9a0c79d4f065c3801d' +GIT_REPO = 'https://github.com/aws/sagemaker-python-sdk.git' +BRANCH = 'test-branch-git-config' +COMMIT = '329bfcf884482002c05ff7f44f62599ebc9f445a' def test_git_support_with_pytorch(sagemaker_local_session): diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index 2fb60c76ea..2630b23d4b 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -19,9 +19,9 @@ from sagemaker import git_utils REPO_DIR = '/tmp/repo_dir' -GIT_REPO = 'https://github.com/GaryTu1020/sagemaker-python-sdk.git' -BRANCH = 'git_support_testing' -COMMIT = 'b8724a04ee00cb74c12c1b9a0c79d4f065c3801d' +GIT_REPO = 'https://github.com/aws/sagemaker-python-sdk.git' +BRANCH = 'test-branch-git-config' +COMMIT = '329bfcf884482002c05ff7f44f62599ebc9f445a' @patch('subprocess.check_call') From e2e5c207d5911c7945aa5fd07fe5f2c94b8379d9 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 17 Jun 2019 16:58:03 -0700 Subject: [PATCH 16/33] change the testing repo --- src/sagemaker/estimator.py | 6 +++--- tests/unit/test_estimator.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index e0a46fde2b..fa0cf5789e 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -803,9 +803,9 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, The following config: - >>> git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git', - >>> 'branch': 'master', - >>> 'commit': 'aea6f3acef9619f77f94772d9d654f041e16bf49'} + >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git', + >>> 'branch': 'test-branch-git-config', + >>> 'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'} results in cloning the repo specified in 'repo', then checkout the 'master' branch, and checkout the specified commit. diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index dd921e2cc4..6ae5c016de 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -48,9 +48,9 @@ JOB_NAME = '{}-{}'.format(IMAGE_NAME, TIMESTAMP) TAGS = [{'Name': 'some-tag', 'Value': 'value-for-tag'}] OUTPUT_PATH = 's3://bucket/prefix' -GIT_REPO = 'https://github.com/GaryTu1020/python-sdk-testing.git' -BRANCH = 'branch1' -COMMIT = 'b61c450200d6a309c8d24ac14b8adddc405acc56' +GIT_REPO = 'https://github.com/aws/sagemaker-python-sdk.git' +BRANCH = 'test-branch-git-config' +COMMIT = '329bfcf884482002c05ff7f44f62599ebc9f445a' DESCRIBE_TRAINING_JOB_RESULT = { 'ModelArtifacts': { From 347747765a9a4e0e68e0eeb31f70ef0f5e393710 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Tue, 18 Jun 2019 10:17:22 -0700 Subject: [PATCH 17/33] write some test cases for serving code --- src/sagemaker/model.py | 14 ++++++++++++++ tests/data/mxnet_mnist/MANIFEST.in | 5 +++++ tests/data/mxnet_mnist/setup.cfg | 3 +++ tests/data/mxnet_mnist/setup.py | 6 ++++++ tests/unit/test_model.py | 7 ++++++- 5 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 tests/data/mxnet_mnist/MANIFEST.in create mode 100644 tests/data/mxnet_mnist/setup.cfg create mode 100644 tests/data/mxnet_mnist/setup.py diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index 0bd5fdb5be..a7df8461aa 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -387,6 +387,20 @@ def __init__(self, model_data, image, role, entry_point, source_dir=None, predic >>> |----- test.py You can assign entry_point='src/inference.py'. + git_config (dict[str, str]): Git configurations used for cloning files, including 'repo', 'branch' + and 'commit' (default: None). + 'branch' and 'commit' are optional. If 'branch' is not specified, 'master' branch will be used. If + 'commit' is not specified, the latest commit in the required branch will be used. + Example: + + The following config: + + >>> git_config = {'repo': 'https://github.com/aws/sagemaker-python-sdk.git', + >>> 'branch': 'test-branch-git-config', + >>> 'commit': '329bfcf884482002c05ff7f44f62599ebc9f445a'} + + results in cloning the repo specified in 'repo', then checkout the 'master' branch, and checkout + the specified commit. source_dir (str): Path (absolute or relative) to a directory with any other training source code dependencies aside from the entry point file (default: None). Structure within this directory will be preserved when training on SageMaker. If 'git_config' is provided, diff --git a/tests/data/mxnet_mnist/MANIFEST.in b/tests/data/mxnet_mnist/MANIFEST.in new file mode 100644 index 0000000000..20c477388d --- /dev/null +++ b/tests/data/mxnet_mnist/MANIFEST.in @@ -0,0 +1,5 @@ + +recursive-include . * +recursive-exclude . __pycache__* +recursive-exclude . *.pyc +recursive-exclude . *.pyo diff --git a/tests/data/mxnet_mnist/setup.cfg b/tests/data/mxnet_mnist/setup.cfg new file mode 100644 index 0000000000..37672a0c0d --- /dev/null +++ b/tests/data/mxnet_mnist/setup.cfg @@ -0,0 +1,3 @@ + +[wheel] +universal = 1 diff --git a/tests/data/mxnet_mnist/setup.py b/tests/data/mxnet_mnist/setup.py new file mode 100644 index 0000000000..e9282bf8bc --- /dev/null +++ b/tests/data/mxnet_mnist/setup.py @@ -0,0 +1,6 @@ + +from setuptools import setup +setup(packages=[''], + name="mnist", + version='1.0.0', + include_package_data=True) diff --git a/tests/unit/test_model.py b/tests/unit/test_model.py index 1e813796d4..7bdbc6cc8e 100644 --- a/tests/unit/test_model.py +++ b/tests/unit/test_model.py @@ -105,6 +105,11 @@ def __init__(self, sagemaker_session, **kwargs): def create_predictor(self, endpoint_name): return RealTimePredictor(endpoint_name, sagemaker_session=self.sagemaker_session) +class DummyFrameworkModelGit(FrameworkModel): + + def __init__(self, entry_point, sagemaker_session, **kwargs): + super(DummyFrameworkModelGit, self).__init__(MODEL_DATA, MODEL_IMAGE, ROLE, ) + @pytest.fixture() def sagemaker_session(): @@ -423,4 +428,4 @@ def test_check_neo_region(sagemaker_session, tmpdir): assert model.check_neo_region(region_name) is False -def test_git_support_with \ No newline at end of file +# def test_git_support_with_ \ No newline at end of file From 7b53f20550d3942139bfd529851b8e639aaf4132 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Tue, 18 Jun 2019 14:26:00 -0700 Subject: [PATCH 18/33] write a test --- tests/unit/test_model.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_model.py b/tests/unit/test_model.py index 7bdbc6cc8e..1d3f0f3609 100644 --- a/tests/unit/test_model.py +++ b/tests/unit/test_model.py @@ -39,6 +39,9 @@ IMAGE_NAME = 'fakeimage' REGION = 'us-west-2' MODEL_NAME = '{}-{}'.format(MODEL_IMAGE, TIMESTAMP) +GIT_REPO = 'https://github.com/aws/sagemaker-python-sdk.git' +BRANCH = 'test-branch-git-config' +COMMIT = '329bfcf884482002c05ff7f44f62599ebc9f445a' DESCRIBE_MODEL_PACKAGE_RESPONSE = { @@ -105,10 +108,15 @@ def __init__(self, sagemaker_session, **kwargs): def create_predictor(self, endpoint_name): return RealTimePredictor(endpoint_name, sagemaker_session=self.sagemaker_session) -class DummyFrameworkModelGit(FrameworkModel): - def __init__(self, entry_point, sagemaker_session, **kwargs): - super(DummyFrameworkModelGit, self).__init__(MODEL_DATA, MODEL_IMAGE, ROLE, ) +class DummyFrameworkModelForGit(FrameworkModel): + + def __init__(self, sagemaker_session, entry_point, **kwargs): + super(DummyFrameworkModelForGit, self).__init__(MODEL_DATA, MODEL_IMAGE, ROLE, entry_point=entry_point, + sagemaker_session=sagemaker_session, **kwargs) + + def create_predictor(self, endpoint_name): + return RealTimePredictor(endpoint_name, sagemaker_session=self.sagemaker_session) @pytest.fixture() @@ -428,4 +436,20 @@ def test_check_neo_region(sagemaker_session, tmpdir): assert model.check_neo_region(region_name) is False -# def test_git_support_with_ \ No newline at end of file +@patch('sagemaker.git_utils.git_clone_repo') +@patch('sagemaker.model.FrameworkModel._upload_code') +def test_git_support_succeed(upload_code, git_clone_repo, sagemaker_session): + git_clone_repo.side_effect = lambda gitconfig, entrypoint, sourcedir, dependency=None: { + 'entry_point': 'entry_point', 'source_dir': '/tmp/repo_dir/source_dir', + 'dependencies': ['/tmp/repo_dir/foo', '/tmp/repo_dir/bar']} + entry_point = 'entry_point' + source_dir = 'source_dir' + dependencies = ['foo', 'bar'] + git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} + model = DummyFrameworkModelForGit(sagemaker_session=sagemaker_session, + entry_point=entry_point, + source_dir=source_dir, + dependencies=dependencies, + git_config=git_config) + model.deploy(instance_type=INSTANCE_TYPE, initial_instance_count=1) + git_clone_repo.assert_called_once_with(git_config, entry_point, source_dir, dependencies) From c6daa5dbb7590507cbbbbc588d0f912b5ad75e74 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Tue, 18 Jun 2019 14:35:59 -0700 Subject: [PATCH 19/33] correct an error message --- tests/unit/test_estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 6ae5c016de..1495307307 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -705,7 +705,7 @@ def test_git_support_branch_not_exist(sagemaker_session): @patch('sagemaker.git_utils.git_clone_repo', - side_effect=subprocess.CalledProcessError(returncode=1, cmd='commit-sha-that-does-not-exist')) + side_effect=subprocess.CalledProcessError(returncode=1, cmd='git checkout commit-sha-that-does-not-exist')) def test_git_support_commit_not_exist(sagemaker_session): git_config = {'repo': GIT_REPO, 'branch': BRANCH, From e5bd806e30644a3572007fe3cc72a795ae6e8675 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Tue, 18 Jun 2019 16:17:01 -0700 Subject: [PATCH 20/33] stop patching private methods --- .python-version | 1 + tests/integ/test_git.py | 36 ++++++++++++++--------- tests/unit/test_git_utils.py | 57 +++++++++++------------------------- 3 files changed, 40 insertions(+), 54 deletions(-) create mode 100644 .python-version diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000..424e1794de --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.6.8 diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index 645a3c613a..859b19f4d7 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -34,15 +34,16 @@ def test_git_support_with_pytorch(sagemaker_local_session): train_instance_count=1, train_instance_type='local', sagemaker_session=sagemaker_local_session, git_config=git_config) - train_input = pytorch.sagemaker_session.upload_data(path=os.path.join(data_path, 'training'), - key_prefix='integ-test-data/pytorch_mnist/training') - pytorch.fit({'training': train_input}) + pytorch.fit({'training': 'file://' + os.path.join(data_path, 'training')}) - predictor = pytorch.deploy(initial_instance_count=1, instance_type='local') + try: + predictor = pytorch.deploy(initial_instance_count=1, instance_type='local') - data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32) - result = predictor.predict(data) - assert result is not None + data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32) + result = predictor.predict(data) + assert result is not None + finally: + predictor.delete_endpoint() def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version): @@ -56,12 +57,19 @@ def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version): train_instance_count=1, train_instance_type='local', sagemaker_session=sagemaker_local_session, git_config=git_config) - train_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'), - key_prefix='integ-test-data/mxnet_mnist/train') - test_input = mx.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'), - key_prefix='integ-test-data/mxnet_mnist/test') - - mx.fit({'train': train_input, 'test': test_input}) + mx.fit({'train': 'file://' + os.path.join(data_path, 'train'), + 'test': 'file://' + os.path.join(data_path, 'test')}) files = [file for file in os.listdir(mx.source_dir)] - assert 'some_file' in files and 'mnist.py' in files + assert 'some_file' in files + assert 'mnist.py' in files + assert os.path.exists(mx.dependencies[0]) + + try: + predictor = mx.deploy(initial_instance_count=1, instance_type='local') + + data = numpy.zeros(shape=(1, 1, 28, 28)) + result = predictor.predict(data) + assert result is not None + finally: + predictor.delete_endpoint() diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index 2630b23d4b..3343adc5bd 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -26,22 +26,19 @@ @patch('subprocess.check_call') @patch('tempfile.mkdtemp', return_value=REPO_DIR) -@patch('sagemaker.git_utils._validate_git_config') -@patch('sagemaker.git_utils._checkout_branch_and_commit') @patch('os.path.isfile', return_value=True) @patch('os.path.isdir', return_value=True) @patch('os.path.exists', return_value=True) -def test_git_clone_repo_succeed(exists, isdir, isfile, checkout_branch_and_commit, - validate_git_config, mkdtemp, check_call): +def test_git_clone_repo_succeed(exists, isdir, isfile, mkdtemp, check_call): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} entry_point = 'entry_point' source_dir = 'source_dir' dependencies = ['foo', 'bar'] ret = git_utils.git_clone_repo(git_config, entry_point, source_dir, dependencies) - validate_git_config.assert_called_with(git_config) - check_call.assert_called_with(['git', 'clone', git_config['repo'], REPO_DIR]) + check_call.assert_any_call(['git', 'clone', git_config['repo'], REPO_DIR]) + check_call.assert_any_call(args=['git', 'checkout', BRANCH], cwd=REPO_DIR) + check_call.assert_any_call(args=['git', 'checkout', COMMIT], cwd=REPO_DIR) mkdtemp.assert_called_once() - checkout_branch_and_commit.assert_called_with(git_config, REPO_DIR) assert ret['entry_point'] == 'entry_point' assert ret['source_dir'] == '/tmp/repo_dir/source_dir' assert ret['dependencies'] == ['/tmp/repo_dir/foo', '/tmp/repo_dir/bar'] @@ -49,14 +46,10 @@ def test_git_clone_repo_succeed(exists, isdir, isfile, checkout_branch_and_commi @patch('subprocess.check_call') @patch('tempfile.mkdtemp', return_value=REPO_DIR) -@patch('sagemaker.git_utils._validate_git_config', - side_effect=ValueError('Please provide a repo for git_config.')) -@patch('sagemaker.git_utils._checkout_branch_and_commit') @patch('os.path.isfile', return_value=True) @patch('os.path.isdir', return_value=True) @patch('os.path.exists', return_value=True) -def test_git_clone_repo_repo_not_provided(exists, isdir, isfile, checkout_branch_and_commit, - validate_git_config, mkdtemp, check_call): +def test_git_clone_repo_repo_not_provided(exists, isdir, isfile, mkdtemp, check_call): git_config = {'branch': BRANCH, 'commit': COMMIT} entry_point = 'entry_point_that_does_not_exist' source_dir = 'source_dir' @@ -69,13 +62,10 @@ def test_git_clone_repo_repo_not_provided(exists, isdir, isfile, checkout_branch @patch('subprocess.check_call', side_effect=subprocess.CalledProcessError(returncode=1, cmd='git clone {} {}'.format(GIT_REPO, REPO_DIR))) @patch('tempfile.mkdtemp', return_value=REPO_DIR) -@patch('sagemaker.git_utils._validate_git_config') -@patch('sagemaker.git_utils._checkout_branch_and_commit') @patch('os.path.isfile', return_value=True) @patch('os.path.isdir', return_value=True) @patch('os.path.exists', return_value=True) -def test_git_clone_repo_clone_fail(exists, isdir, isfile, checkout_branch_and_commit, - validate_git_config, mkdtemp, check_call): +def test_git_clone_repo_clone_fail(exists, isdir, isfile, mkdtemp, check_call): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} entry_point = 'entry_point' source_dir = 'source_dir' @@ -85,16 +75,14 @@ def test_git_clone_repo_clone_fail(exists, isdir, isfile, checkout_branch_and_co assert 'returned non-zero exit status' in str(error) -@patch('subprocess.check_call') +@patch('subprocess.check_call', + side_effect=[True, + subprocess.CalledProcessError(returncode=1, cmd='git checkout banana')]) @patch('tempfile.mkdtemp', return_value=REPO_DIR) -@patch('sagemaker.git_utils._validate_git_config') -@patch('sagemaker.git_utils._checkout_branch_and_commit', - side_effect=subprocess.CalledProcessError(returncode=1, cmd='git checkout {}'.format(BRANCH))) @patch('os.path.isfile', return_value=True) @patch('os.path.isdir', return_value=True) @patch('os.path.exists', return_value=True) -def test_git_clone_repo_branch_not_exist(exists, isdir, isfile, checkout_branch_and_commit, - validate_git_config, mkdtemp, check_call): +def test_git_clone_repo_branch_not_exist(exists, isdir, isfile, mkdtemp, check_call): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} entry_point = 'entry_point' source_dir = 'source_dir' @@ -104,16 +92,14 @@ def test_git_clone_repo_branch_not_exist(exists, isdir, isfile, checkout_branch_ assert 'returned non-zero exit status' in str(error) -@patch('subprocess.check_call') +@patch('subprocess.check_call', + side_effect=[True, True, + subprocess.CalledProcessError(returncode=1, cmd='git checkout {}'.format(COMMIT))]) @patch('tempfile.mkdtemp', return_value=REPO_DIR) -@patch('sagemaker.git_utils._validate_git_config') -@patch('sagemaker.git_utils._checkout_branch_and_commit', - side_effect=subprocess.CalledProcessError(returncode=1, cmd='git checkout {}'.format(COMMIT))) @patch('os.path.isfile', return_value=True) @patch('os.path.isdir', return_value=True) @patch('os.path.exists', return_value=True) -def test_git_clone_repo_commit_not_exist(exists, isdir, isfile, checkout_branch_and_commit, - validate_git_config, mkdtemp, check_call): +def test_git_clone_repo_commit_not_exist(exists, isdir, isfile, mkdtemp, check_call): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} entry_point = 'entry_point' source_dir = 'source_dir' @@ -125,13 +111,10 @@ def test_git_clone_repo_commit_not_exist(exists, isdir, isfile, checkout_branch_ @patch('subprocess.check_call') @patch('tempfile.mkdtemp', return_value=REPO_DIR) -@patch('sagemaker.git_utils._validate_git_config') -@patch('sagemaker.git_utils._checkout_branch_and_commit') @patch('os.path.isfile', return_value=False) @patch('os.path.isdir', return_value=True) @patch('os.path.exists', return_value=True) -def test_git_clone_repo_entry_point_not_exist(exists, isdir, isfile, checkout_branch_and_commit, - validate_git_config, mkdtemp, check_call): +def test_git_clone_repo_entry_point_not_exist(exists, isdir, isfile, mkdtemp, check_call): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} entry_point = 'entry_point_that_does_not_exist' source_dir = 'source_dir' @@ -143,13 +126,10 @@ def test_git_clone_repo_entry_point_not_exist(exists, isdir, isfile, checkout_br @patch('subprocess.check_call') @patch('tempfile.mkdtemp', return_value=REPO_DIR) -@patch('sagemaker.git_utils._validate_git_config') -@patch('sagemaker.git_utils._checkout_branch_and_commit') @patch('os.path.isfile', return_value=True) @patch('os.path.isdir', return_value=False) @patch('os.path.exists', return_value=True) -def test_git_clone_repo_source_dir_not_exist(exists, isdir, isfile, checkout_branch_and_commit, - validate_git_config, mkdtemp, check_call): +def test_git_clone_repo_source_dir_not_exist(exists, isdir, isfile, mkdtemp, check_call): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} entry_point = 'entry_point' source_dir = 'source_dir_that_does_not_exist' @@ -161,13 +141,10 @@ def test_git_clone_repo_source_dir_not_exist(exists, isdir, isfile, checkout_bra @patch('subprocess.check_call') @patch('tempfile.mkdtemp', return_value=REPO_DIR) -@patch('sagemaker.git_utils._validate_git_config') -@patch('sagemaker.git_utils._checkout_branch_and_commit') @patch('os.path.isfile', return_value=True) @patch('os.path.isdir', return_value=True) @patch('os.path.exists', side_effect=[True, False]) -def test_git_clone_repo_dependencies_not_exist(exists, isdir, isfile, checkout_branch_and_commit, - validate_git_config, mkdtemp, check_call): +def test_git_clone_repo_dependencies_not_exist(exists, isdir, isfile, mkdtemp, check_call): git_config = {'repo': GIT_REPO, 'branch': BRANCH, 'commit': COMMIT} entry_point = 'entry_point' source_dir = 'source_dir' From 2af9b245add7e67b55a990a34f2ba5bebc3ab1fa Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Wed, 19 Jun 2019 13:24:23 -0700 Subject: [PATCH 21/33] slight change to overview.rst --- doc/overview.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/overview.rst b/doc/overview.rst index aa3ee3da38..a2885f8c9b 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -126,10 +126,10 @@ The following are some examples to define estimators with Git support: # In this example, the entry point 'mnist.py' is all we need for source code. # We need to specify the path to it in the Git repo. mx_estimator = MXNet(entry_point='mxnet/mnist.py', - role='SageMakerRole', - git_config=git_config, - train_instance_count=1, - train_instance_type='ml.c4.xlarge') + role='SageMakerRole', + git_config=git_config, + train_instance_count=1, + train_instance_type='ml.c4.xlarge') # In this example, besides entry point and other source code in source directory, we still need some # dependencies for the training job. Dependencies should also be paths inside the Git repo. From b102563e594dfcdf2be98715d9a3dd80295582a8 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Wed, 19 Jun 2019 13:48:57 -0700 Subject: [PATCH 22/33] add a comment for lock --- tests/integ/test_git.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index d4ab2436bc..71c6c1a321 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -25,6 +25,8 @@ GIT_REPO = 'https://github.com/aws/sagemaker-python-sdk.git' BRANCH = 'test-branch-git-config' COMMIT = '329bfcf884482002c05ff7f44f62599ebc9f445a' + +# endpoint tests all use the same port, so we use this lock to prevent concurrent execution LOCK_PATH = os.path.join(tempfile.gettempdir(), 'sagemaker_test_git_lock') From b6e75d026d10e34456a8d8059e613ab1dee0bbce Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Fri, 21 Jun 2019 16:14:30 -0700 Subject: [PATCH 23/33] merge with master --- src/sagemaker/estimator.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 945eed3bb5..3d6ba1c15c 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -780,7 +780,7 @@ class Framework(EstimatorBase): def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cloudwatch_metrics=False, container_log_level=logging.INFO, code_location=None, image_name=None, dependencies=None, - enable_network_isolation=False, **kwargs): + enable_network_isolation=False, git_config=False, **kwargs): """Base class initializer. Subclasses which override ``__init__`` should invoke ``super()`` Args: @@ -814,7 +814,6 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl the specified commit. source_dir (str): Path (absolute or relative) to a directory with any other training source code dependencies aside from the entry point file (default: None). Structure within this -<<<<<<< HEAD directory are preserved when training on Amazon SageMaker. If 'git_config' is provided, source_dir should be a relative location to a directory in the Git repo. Example: @@ -828,8 +827,6 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl and you need 'train.py' as entry point and 'test.py' as training source code as well, you can assign entry_point='train.py', source_dir='src'. -======= - directory are preserved when training on Amazon SageMaker. hyperparameters (dict): Hyperparameters that will be used for training (default: None). The hyperparameters are made accessible as a dict[str, str] to the training code on SageMaker. For convenience, this accepts other types for keys and values, but ``str()`` will be called @@ -845,7 +842,6 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cl image_name (str): An alternate image name to use instead of the official Sagemaker image for the framework. This is useful to run one of the Sagemaker supported frameworks with an image containing custom dependencies. ->>>>>>> f1d34ad4073f8d856ef9c596b491f8a4cd8ef31f dependencies (list[str]): A list of paths to directories (absolute or relative) with any additional libraries that will be exported to the container (default: []). The library folders will be copied to SageMaker in the same folder where the entrypoint is copied. From 3621bd484affa1bd80b9c6fbd53f2bd3d3754796 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Fri, 21 Jun 2019 16:19:40 -0700 Subject: [PATCH 24/33] merge with master --- src/sagemaker/estimator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 3d6ba1c15c..915e74c916 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -780,7 +780,7 @@ class Framework(EstimatorBase): def __init__(self, entry_point, source_dir=None, hyperparameters=None, enable_cloudwatch_metrics=False, container_log_level=logging.INFO, code_location=None, image_name=None, dependencies=None, - enable_network_isolation=False, git_config=False, **kwargs): + enable_network_isolation=False, git_config=None, **kwargs): """Base class initializer. Subclasses which override ``__init__`` should invoke ``super()`` Args: From affa29e00c17a299c3b806a1c46e3fe972e6c62b Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 24 Jun 2019 11:36:30 -0700 Subject: [PATCH 25/33] add documentation for serving --- doc/overview.rst | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/overview.rst b/doc/overview.rst index f6da2c1ba9..43747542a0 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -108,7 +108,7 @@ Here are ways to specify ``git_config``: # Only providing 'repo' is also allowed. If this is the case, latest commit in # 'master' branch will be used. - git_config = {'repo': 'https://github.com/username/repo-with-training-scripts.git' + git_config = {'repo': 'https://github.com/username/repo-with-training-scripts.git'} The following are some examples to define estimators with Git support: @@ -723,6 +723,23 @@ After that, invoke the ``deploy()`` method on the ``Model``: This returns a predictor the same way an ``Estimator`` does when ``deploy()`` is called. You can now get inferences just like with any other model deployed on Amazon SageMaker. +Git support is also available when you bring your own model, through which you can use infenrce scripts stored in your +Git repositories. The process is similar to using Git support for training jobs. You can simply provide ``git_config`` +when create the ``Model`` object, and let ``entry_point``, ``source_dir`` and ``dependencies`` (if needed) be relative +paths inside the Git repository: + +.. code:: python + + git_config = {'repo': 'https://github.com/username/repo-with-training-scripts.git', + 'branch': 'branch1', + 'commit': '4893e528afa4a790331e1b5286954f073b0f14a2'} + + sagemaker_model = MXNetModel(model_data='s3://path/to/model.tar.gz', + role='arn:aws:iam::accid:sagemaker-role', + entry_point='inference.py', + source_dir='mxnet', + git_config=git_config) + A full example is available in the `Amazon SageMaker examples repository `__. From 70ba4335858fc634e461c34bb25289da587175b3 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 24 Jun 2019 16:42:40 -0700 Subject: [PATCH 26/33] mock git_clone_repo for clone fail tests --- tests/unit/test_estimator.py | 7 ++++++- tests/unit/test_model.py | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 14da310670..7963e47ccf 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -64,7 +64,6 @@ DESCRIBE_TRAINING_JOB_RESULT = {"ModelArtifacts": {"S3ModelArtifacts": MODEL_DATA}} - RETURNED_JOB_DESCRIPTION = { "AlgorithmSpecification": { "TrainingInputMode": "File", @@ -898,6 +897,12 @@ def test_git_support_bad_repo_url_format(sagemaker_session): assert "returned non-zero exit status" in str(error) +@patch( + "sagemaker.git_utils.git_clone_repo", + side_effect=subprocess.CalledProcessError( + returncode=1, cmd="git clone https://github.com/aws/no-such-repo.git /tmp/repo_dir" + ), +) def test_git_support_git_clone_fail(sagemaker_session): git_config = {"repo": "https://github.com/aws/no-such-repo.git", "branch": BRANCH} fw = DummyFramework( diff --git a/tests/unit/test_model.py b/tests/unit/test_model.py index 6e467a0972..4420fccf0b 100644 --- a/tests/unit/test_model.py +++ b/tests/unit/test_model.py @@ -564,6 +564,12 @@ def test_git_support_repo_not_provided(sagemaker_session): assert "Please provide a repo for git_config." in str(error) +@patch( + "sagemaker.git_utils.git_clone_repo", + side_effect=subprocess.CalledProcessError( + returncode=1, cmd="git clone https://github.com/aws/no-such-repo.git /tmp/repo_dir" + ), +) def test_git_support_git_clone_fail(sagemaker_session): entry_point = "source_dir/entry_point" git_config = {"repo": "https://github.com/aws/no-such-repo.git", "branch": BRANCH} From 6453c91dfefc505f766f7b2c2eb2a01ebb20b29b Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Mon, 24 Jun 2019 17:04:33 -0700 Subject: [PATCH 27/33] delete unnecessary files --- src/sagemaker/estimator.py | 2 -- tests/data/mxnet_mnist/MANIFEST.in | 5 ----- tests/data/mxnet_mnist/setup.cfg | 3 --- tests/data/mxnet_mnist/setup.py | 3 --- 4 files changed, 13 deletions(-) delete mode 100644 tests/data/mxnet_mnist/MANIFEST.in delete mode 100644 tests/data/mxnet_mnist/setup.cfg delete mode 100644 tests/data/mxnet_mnist/setup.py diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 64aef7bc62..95f6d28ee1 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -15,7 +15,6 @@ import json import logging import os - import warnings from abc import ABCMeta from abc import abstractmethod @@ -25,7 +24,6 @@ import sagemaker from sagemaker import git_utils from sagemaker.analytics import TrainingJobAnalytics - from sagemaker.fw_utils import ( create_image_uri, tar_and_upload_dir, diff --git a/tests/data/mxnet_mnist/MANIFEST.in b/tests/data/mxnet_mnist/MANIFEST.in deleted file mode 100644 index 20c477388d..0000000000 --- a/tests/data/mxnet_mnist/MANIFEST.in +++ /dev/null @@ -1,5 +0,0 @@ - -recursive-include . * -recursive-exclude . __pycache__* -recursive-exclude . *.pyc -recursive-exclude . *.pyo diff --git a/tests/data/mxnet_mnist/setup.cfg b/tests/data/mxnet_mnist/setup.cfg deleted file mode 100644 index 37672a0c0d..0000000000 --- a/tests/data/mxnet_mnist/setup.cfg +++ /dev/null @@ -1,3 +0,0 @@ - -[wheel] -universal = 1 diff --git a/tests/data/mxnet_mnist/setup.py b/tests/data/mxnet_mnist/setup.py deleted file mode 100644 index c2b12666bc..0000000000 --- a/tests/data/mxnet_mnist/setup.py +++ /dev/null @@ -1,3 +0,0 @@ -from setuptools import setup - -setup(packages=[""], name="mnist", version="1.0.0", include_package_data=True) From 33045e891c275bc0c06abcd2362c30b798a9d05a Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Tue, 25 Jun 2019 15:04:12 -0700 Subject: [PATCH 28/33] change infence script for mxnet --- tests/integ/test_git.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index 254ce74f60..5885d7b7b2 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -25,9 +25,9 @@ from sagemaker.sklearn.model import SKLearnModel from tests.integ import DATA_DIR, PYTHON_VERSION -GIT_REPO = "https://github.com/aws/sagemaker-python-sdk.git" -BRANCH = "test-branch-git-config" -COMMIT = "329bfcf884482002c05ff7f44f62599ebc9f445a" +GIT_REPO = "https://github.com/GaryTu1020/python-sdk-testing.git" +BRANCH = "branch1" +COMMIT = "73538c46da0cbd522191c2fa39b194bae4a8050d" # endpoint tests all use the same port, so we use this lock to prevent concurrent execution LOCK_PATH = os.path.join(tempfile.gettempdir(), "sagemaker_test_git_lock") @@ -95,13 +95,14 @@ def test_git_support_with_mxnet(sagemaker_local_session): with lock.lock(LOCK_PATH): try: + serving_script_path = 'mnist_hosting_with_custom_handlers.py' client = sagemaker_local_session.sagemaker_client desc = client.describe_training_job(TrainingJobName=mx.latest_training_job.name) model_data = desc["ModelArtifacts"]["S3ModelArtifacts"] model = MXNetModel( model_data, "SageMakerRole", - entry_point=script_path, + entry_point=serving_script_path, source_dir=source_dir, dependencies=dependencies, py_version=PYTHON_VERSION, From 9df9582caee3bf28ac0c0644b279cb920c9211f4 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Wed, 26 Jun 2019 10:59:38 -0700 Subject: [PATCH 29/33] address pr commments --- src/sagemaker/estimator.py | 8 ++++---- src/sagemaker/git_utils.py | 20 ++++++++++++++------ src/sagemaker/model.py | 4 ++-- tests/integ/test_git.py | 8 ++++---- tests/unit/test_estimator.py | 2 +- tests/unit/test_git_utils.py | 10 +++++++++- tests/unit/test_model.py | 2 +- 7 files changed, 35 insertions(+), 19 deletions(-) diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py index 95f6d28ee1..fb7e455167 100644 --- a/src/sagemaker/estimator.py +++ b/src/sagemaker/estimator.py @@ -1083,12 +1083,12 @@ def _prepare_for_training(self, job_name=None): super(Framework, self)._prepare_for_training(job_name=job_name) if self.git_config: - updates = git_utils.git_clone_repo( + updated_paths = git_utils.git_clone_repo( self.git_config, self.entry_point, self.source_dir, self.dependencies ) - self.entry_point = updates["entry_point"] - self.source_dir = updates["source_dir"] - self.dependencies = updates["dependencies"] + self.entry_point = updated_paths["entry_point"] + self.source_dir = updated_paths["source_dir"] + self.dependencies = updated_paths["dependencies"] # validate source dir will raise a ValueError if there is something wrong with the # source directory. We are intentionally not handling it because this is a critical error. diff --git a/src/sagemaker/git_utils.py b/src/sagemaker/git_utils.py index b68442b7d5..e4c4a51260 100644 --- a/src/sagemaker/git_utils.py +++ b/src/sagemaker/git_utils.py @@ -43,13 +43,19 @@ def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None): Returns: dict: A dict that contains the updated values of entry_point, source_dir and dependencies """ + if entry_point is None: + raise ValueError("Please provide an entry point.") _validate_git_config(git_config) repo_dir = tempfile.mkdtemp() subprocess.check_call(["git", "clone", git_config["repo"], repo_dir]) _checkout_branch_and_commit(git_config, repo_dir) - ret = {"entry_point": entry_point, "source_dir": source_dir, "dependencies": dependencies} + updated_paths = { + "entry_point": entry_point, + "source_dir": source_dir, + "dependencies": dependencies, + } # check if the cloned repo contains entry point, source directory and dependencies if source_dir: @@ -57,20 +63,20 @@ def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None): raise ValueError("Source directory does not exist in the repo.") if not os.path.isfile(os.path.join(repo_dir, source_dir, entry_point)): raise ValueError("Entry point does not exist in the repo.") - ret["source_dir"] = os.path.join(repo_dir, source_dir) + updated_paths["source_dir"] = os.path.join(repo_dir, source_dir) else: if os.path.isfile(os.path.join(repo_dir, entry_point)): - ret["entry_point"] = os.path.join(repo_dir, entry_point) + updated_paths["entry_point"] = os.path.join(repo_dir, entry_point) else: raise ValueError("Entry point does not exist in the repo.") - ret["dependencies"] = [] + updated_paths["dependencies"] = [] for path in dependencies: if os.path.exists(os.path.join(repo_dir, path)): - ret["dependencies"].append(os.path.join(repo_dir, path)) + updated_paths["dependencies"].append(os.path.join(repo_dir, path)) else: raise ValueError("Dependency {} does not exist in the repo.".format(path)) - return ret + return updated_paths def _validate_git_config(git_config): @@ -85,6 +91,8 @@ def _validate_git_config(git_config): 1. git_config has no key 'repo' 2. git_config['repo'] is in the wrong format. """ + if git_config is None: + raise ValueError("") if "repo" not in git_config: raise ValueError("Please provide a repo for git_config.") diff --git a/src/sagemaker/model.py b/src/sagemaker/model.py index e6e062915d..3c44b2b35b 100644 --- a/src/sagemaker/model.py +++ b/src/sagemaker/model.py @@ -554,13 +554,13 @@ def __init__( Example: The following call - >>> Estimator(entry_point='inference.py', dependencies=['my/libs/common', 'virtual-env']) + >>> Estimator(entry_point='train.py', dependencies=['my/libs/common', 'virtual-env']) results in the following inside the container: >>> $ ls >>> opt/ml/code - >>> |------ inference.py + >>> |------ train.py >>> |------ common >>> |------ virtual-env diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index 5885d7b7b2..dfa2089329 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -25,9 +25,9 @@ from sagemaker.sklearn.model import SKLearnModel from tests.integ import DATA_DIR, PYTHON_VERSION -GIT_REPO = "https://github.com/GaryTu1020/python-sdk-testing.git" -BRANCH = "branch1" -COMMIT = "73538c46da0cbd522191c2fa39b194bae4a8050d" +GIT_REPO = "https://github.com/aws/sagemaker-python-sdk.git" +BRANCH = "test-branch-git-config" +COMMIT = "ae15c9d7d5b97ea95ea451e4662ee43da3401d73" # endpoint tests all use the same port, so we use this lock to prevent concurrent execution LOCK_PATH = os.path.join(tempfile.gettempdir(), "sagemaker_test_git_lock") @@ -95,7 +95,7 @@ def test_git_support_with_mxnet(sagemaker_local_session): with lock.lock(LOCK_PATH): try: - serving_script_path = 'mnist_hosting_with_custom_handlers.py' + serving_script_path = "mnist_hosting_with_custom_handlers.py" client = sagemaker_local_session.sagemaker_client desc = client.describe_training_job(TrainingJobName=mx.latest_training_job.name) model_data = desc["ModelArtifacts"]["S3ModelArtifacts"] diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 7963e47ccf..7305a2ecc0 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -50,7 +50,7 @@ OUTPUT_PATH = "s3://bucket/prefix" GIT_REPO = "https://github.com/aws/sagemaker-python-sdk.git" BRANCH = "test-branch-git-config" -COMMIT = "329bfcf884482002c05ff7f44f62599ebc9f445a" +COMMIT = "ae15c9d7d5b97ea95ea451e4662ee43da3401d73" DESCRIBE_TRAINING_JOB_RESULT = {"ModelArtifacts": {"S3ModelArtifacts": MODEL_DATA}} INSTANCE_TYPE = "c4.4xlarge" diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index 02fb2f43e1..a862e76704 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -21,7 +21,7 @@ REPO_DIR = "/tmp/repo_dir" GIT_REPO = "https://github.com/aws/sagemaker-python-sdk.git" BRANCH = "test-branch-git-config" -COMMIT = "329bfcf884482002c05ff7f44f62599ebc9f445a" +COMMIT = "ae15c9d7d5b97ea95ea451e4662ee43da3401d73" @patch("subprocess.check_call") @@ -44,6 +44,14 @@ def test_git_clone_repo_succeed(exists, isdir, isfile, mkdtemp, check_call): assert ret["dependencies"] == ["/tmp/repo_dir/foo", "/tmp/repo_dir/bar"] +def test_git_clone_repo_entry_point_not_provided(): + git_config = {"repo": GIT_REPO, "branch": BRANCH, "commit": COMMIT} + source_dir = "source_dir" + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(git_config=git_config, entry_point=None, source_dir=source_dir) + assert "Please provide an entry point." in str(error) + + @patch("subprocess.check_call") @patch("tempfile.mkdtemp", return_value=REPO_DIR) @patch("os.path.isfile", return_value=True) diff --git a/tests/unit/test_model.py b/tests/unit/test_model.py index 4420fccf0b..fa6253f925 100644 --- a/tests/unit/test_model.py +++ b/tests/unit/test_model.py @@ -42,7 +42,7 @@ MODEL_NAME = "{}-{}".format(MODEL_IMAGE, TIMESTAMP) GIT_REPO = "https://github.com/aws/sagemaker-python-sdk.git" BRANCH = "test-branch-git-config" -COMMIT = "329bfcf884482002c05ff7f44f62599ebc9f445a" +COMMIT = "ae15c9d7d5b97ea95ea451e4662ee43da3401d73" DESCRIBE_MODEL_PACKAGE_RESPONSE = { From e5262066bc562969856d209bc7d00ed2d3ed1489 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Tue, 2 Jul 2019 16:46:39 -0700 Subject: [PATCH 30/33] mark test_git_support_with_sklearn as local_mode --- tests/integ/test_git.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index ce8550d29e..f7c15ab447 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -122,6 +122,7 @@ def test_git_support_with_mxnet(sagemaker_local_session): predictor.delete_endpoint() +@pytest.mark.local_mode def test_git_support_with_sklearn(sagemaker_local_session, sklearn_full_version): script_path = "mnist.py" data_path = os.path.join(DATA_DIR, "sklearn_mnist") From b7e3236c15ea08da4c5920868341c695bbc533c9 Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Tue, 2 Jul 2019 17:53:51 -0700 Subject: [PATCH 31/33] skip the sklearn test for py2 since it is not supported --- .python-version | 1 + tests/integ/test_git.py | 1 + 2 files changed, 2 insertions(+) create mode 100644 .python-version diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000..35b46aeaca --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +2.7.16 diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py index f7c15ab447..a941269122 100644 --- a/tests/integ/test_git.py +++ b/tests/integ/test_git.py @@ -122,6 +122,7 @@ def test_git_support_with_mxnet(sagemaker_local_session): predictor.delete_endpoint() +@pytest.mark.skipif(PYTHON_VERSION != "py3", reason="Scikit-learn image supports only python 3.") @pytest.mark.local_mode def test_git_support_with_sklearn(sagemaker_local_session, sklearn_full_version): script_path = "mnist.py" From 7bf31ff4d7bb51894f943da15b842030615edb9c Mon Sep 17 00:00:00 2001 From: Yue Tu Date: Tue, 2 Jul 2019 17:54:28 -0700 Subject: [PATCH 32/33] rm .python-version --- .python-version | 1 - 1 file changed, 1 deletion(-) delete mode 100644 .python-version diff --git a/.python-version b/.python-version deleted file mode 100644 index 35b46aeaca..0000000000 --- a/.python-version +++ /dev/null @@ -1 +0,0 @@ -2.7.16 From 54b44bb92f364d0e9ae6202cb2796ac024e729d1 Mon Sep 17 00:00:00 2001 From: GaryTu1020 <45720913+GaryTu1020@users.noreply.github.com> Date: Wed, 3 Jul 2019 10:51:09 -0700 Subject: [PATCH 33/33] Update doc/overview.rst correct typo. Co-Authored-By: Marcio Vinicius dos Santos --- doc/overview.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/overview.rst b/doc/overview.rst index 43747542a0..2607270221 100644 --- a/doc/overview.rst +++ b/doc/overview.rst @@ -723,7 +723,7 @@ After that, invoke the ``deploy()`` method on the ``Model``: This returns a predictor the same way an ``Estimator`` does when ``deploy()`` is called. You can now get inferences just like with any other model deployed on Amazon SageMaker. -Git support is also available when you bring your own model, through which you can use infenrce scripts stored in your +Git support is also available when you bring your own model, through which you can use inference scripts stored in your Git repositories. The process is similar to using Git support for training jobs. You can simply provide ``git_config`` when create the ``Model`` object, and let ``entry_point``, ``source_dir`` and ``dependencies`` (if needed) be relative paths inside the Git repository: