Skip to content

Add job_name parameter for local mode #424

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 10, 2018
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ CHANGELOG

* enhancement: Enable setting VPC config when creating/deploying models
* enhancement: Local Mode: accept short lived credentials with a warning message
* bug-fix: Local Mode: pass in job name as parameter for training environment variable

=======
1.11.1
Expand Down
4 changes: 2 additions & 2 deletions src/sagemaker/local/entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def __init__(self, container):
self.start_time = None
self.end_time = None

def start(self, input_data_config, hyperparameters):
def start(self, input_data_config, hyperparameters, job_name):
for channel in input_data_config:
if channel['DataSource'] and 'S3DataSource' in channel['DataSource']:
data_distribution = channel['DataSource']['S3DataSource']['S3DataDistributionType']
Expand All @@ -57,7 +57,7 @@ def start(self, input_data_config, hyperparameters):
self.start = datetime.datetime.now()
self.state = self._TRAINING

self.model_artifacts = self.container.train(input_data_config, hyperparameters)
self.model_artifacts = self.container.train(input_data_config, hyperparameters, job_name)
self.end = datetime.datetime.now()
self.state = self._COMPLETED

Expand Down
5 changes: 3 additions & 2 deletions src/sagemaker/local/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,13 @@ def __init__(self, instance_type, instance_count, image, sagemaker_session=None)
self.container_root = None
self.container = None

def train(self, input_data_config, hyperparameters):
def train(self, input_data_config, hyperparameters, job_name):
"""Run a training job locally using docker-compose.
Args:
input_data_config (dict): The Input Data Configuration, which contains data such as the
channels to be used for training.
hyperparameters (dict): The HyperParameters for the training job.
job_name (str): Name of the local training job being run.

Returns (str): Location of the trained model.
"""
Expand All @@ -109,7 +110,7 @@ def train(self, input_data_config, hyperparameters):

training_env_vars = {
REGION_ENV_NAME: self.sagemaker_session.boto_region_name,
TRAINING_JOB_NAME_ENV_NAME: json.loads(hyperparameters.get(sagemaker.model.JOB_NAME_PARAM_NAME)),
TRAINING_JOB_NAME_ENV_NAME: job_name,
}
compose_data = self._generate_compose_file('train', additional_volumes=volumes,
additional_env_vars=training_env_vars)
Expand Down
2 changes: 1 addition & 1 deletion src/sagemaker/local/local_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ def create_training_job(self, TrainingJobName, AlgorithmSpecification, InputData
AlgorithmSpecification['TrainingImage'], self.sagemaker_session)
training_job = _LocalTrainingJob(container)
hyperparameters = kwargs['HyperParameters'] if 'HyperParameters' in kwargs else {}
training_job.start(InputDataConfig, hyperparameters)
training_job.start(InputDataConfig, hyperparameters, TrainingJobName)

LocalSagemakerClient._training_jobs[TrainingJobName] = training_job

Expand Down
61 changes: 55 additions & 6 deletions tests/unit/test_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
REGION = 'us-west-2'
BUCKET_NAME = 'mybucket'
EXPANDED_ROLE = 'arn:aws:iam::111111111111:role/ExpandedRole'
TRAINING_JOB_NAME = 'my-job'
INPUT_DATA_CONFIG = [
{
'ChannelName': 'a',
Expand All @@ -56,12 +57,16 @@
HYPERPARAMETERS = {'a': 1,
'b': json.dumps('bee'),
'sagemaker_submit_directory': json.dumps('s3://my_bucket/code'),
'sagemaker_job_name': json.dumps('my-job')}
'sagemaker_job_name': json.dumps(TRAINING_JOB_NAME)}

HYPERPARAMETERS_WITHOUT_JOB_NAME = {'a': 1,
'b': json.dumps('bee'),
'sagemaker_submit_directory': json.dumps('s3://my_bucket/code')}

LOCAL_CODE_HYPERPARAMETERS = {'a': 1,
'b': 2,
'sagemaker_submit_directory': json.dumps('file:///tmp/code'),
'sagemaker_job_name': json.dumps('my-job')}
'sagemaker_job_name': json.dumps(TRAINING_JOB_NAME)}


@pytest.fixture()
Expand Down Expand Up @@ -230,7 +235,51 @@ def test_train(_download_folder, _cleanup, popen, _stream_output, LocalSession,
instance_count = 2
image = 'my-image'
sagemaker_container = _SageMakerContainer('local', instance_count, image, sagemaker_session=sagemaker_session)
sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS)
sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS, TRAINING_JOB_NAME)

channel_dir = os.path.join(directories[1], 'b')
download_folder_calls = [call('my-own-bucket', 'prefix', channel_dir)]
_download_folder.assert_has_calls(download_folder_calls)

docker_compose_file = os.path.join(sagemaker_container.container_root, 'docker-compose.yaml')

call_args = popen.call_args[0][0]
assert call_args is not None

expected = ['docker-compose', '-f', docker_compose_file, 'up', '--build', '--abort-on-container-exit']
for i, v in enumerate(expected):
assert call_args[i] == v

with open(docker_compose_file, 'r') as f:
config = yaml.load(f)
assert len(config['services']) == instance_count
for h in sagemaker_container.hosts:
assert config['services'][h]['image'] == image
assert config['services'][h]['command'] == 'train'
assert 'AWS_REGION={}'.format(REGION) in config['services'][h]['environment']
assert 'TRAINING_JOB_NAME={}'.format(TRAINING_JOB_NAME) in config['services'][h]['environment']

# assert that the output directories expected by the sagemaker container exist
assert os.path.exists(os.path.join(sagemaker_container.container_root, 'output'))
assert os.path.exists(os.path.join(sagemaker_container.container_root, 'output/data'))


@patch('sagemaker.local.local_session.LocalSession')
@patch('sagemaker.local.image._stream_output')
@patch('subprocess.Popen')
@patch('sagemaker.local.image._SageMakerContainer._cleanup')
@patch('sagemaker.local.image._SageMakerContainer._download_folder')
def test_train_with_hyperparameters_without_job_name(_download_folder, _cleanup, popen, _stream_output, LocalSession,
tmpdir, sagemaker_session):

directories = [str(tmpdir.mkdir('container-root')), str(tmpdir.mkdir('data'))]
with patch('sagemaker.local.image._SageMakerContainer._create_tmp_folder',
side_effect=directories):

instance_count = 2
image = 'my-image'
sagemaker_container = _SageMakerContainer('local', instance_count, image, sagemaker_session=sagemaker_session)
sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS_WITHOUT_JOB_NAME, TRAINING_JOB_NAME)

channel_dir = os.path.join(directories[1], 'b')
download_folder_calls = [call('my-own-bucket', 'prefix', channel_dir)]
Expand All @@ -252,7 +301,7 @@ def test_train(_download_folder, _cleanup, popen, _stream_output, LocalSession,
assert config['services'][h]['image'] == image
assert config['services'][h]['command'] == 'train'
assert 'AWS_REGION={}'.format(REGION) in config['services'][h]['environment']
assert 'TRAINING_JOB_NAME=my-job' in config['services'][h]['environment']
assert 'TRAINING_JOB_NAME={}'.format(TRAINING_JOB_NAME) in config['services'][h]['environment']

# assert that the output directories expected by the sagemaker container exist
assert os.path.exists(os.path.join(sagemaker_container.container_root, 'output'))
Expand All @@ -273,7 +322,7 @@ def test_train_error(_download_folder, _cleanup, popen, _stream_output, LocalSes
sagemaker_container = _SageMakerContainer('local', instance_count, image, sagemaker_session=sagemaker_session)

with pytest.raises(RuntimeError) as e:
sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS)
sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS, TRAINING_JOB_NAME)

assert 'this is expected' in str(e)

Expand All @@ -293,7 +342,7 @@ def test_train_local_code(_download_folder, _cleanup, popen, _stream_output,
sagemaker_container = _SageMakerContainer('local', instance_count, image,
sagemaker_session=sagemaker_session)

sagemaker_container.train(INPUT_DATA_CONFIG, LOCAL_CODE_HYPERPARAMETERS)
sagemaker_container.train(INPUT_DATA_CONFIG, LOCAL_CODE_HYPERPARAMETERS, TRAINING_JOB_NAME)

docker_compose_file = os.path.join(sagemaker_container.container_root,
'docker-compose.yaml')
Expand Down