diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 2f62b51566..d6adbe5089 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -8,6 +8,7 @@ CHANGELOG
 
 * enhancement: Enable setting VPC config when creating/deploying models
 * enhancement: Local Mode: accept short lived credentials with a warning message
+* bug-fix: Local Mode: pass in job name as parameter for training environment variable
 
 =======
 1.11.1
diff --git a/src/sagemaker/local/entities.py b/src/sagemaker/local/entities.py
index 38c64cec24..a67c14acb1 100644
--- a/src/sagemaker/local/entities.py
+++ b/src/sagemaker/local/entities.py
@@ -41,7 +41,7 @@ def __init__(self, container):
         self.start_time = None
         self.end_time = None
 
-    def start(self, input_data_config, hyperparameters):
+    def start(self, input_data_config, hyperparameters, job_name):
         for channel in input_data_config:
             if channel['DataSource'] and 'S3DataSource' in channel['DataSource']:
                 data_distribution = channel['DataSource']['S3DataSource']['S3DataDistributionType']
@@ -57,7 +57,7 @@ def start(self, input_data_config, hyperparameters):
         self.start = datetime.datetime.now()
         self.state = self._TRAINING
 
-        self.model_artifacts = self.container.train(input_data_config, hyperparameters)
+        self.model_artifacts = self.container.train(input_data_config, hyperparameters, job_name)
         self.end = datetime.datetime.now()
         self.state = self._COMPLETED
 
diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py
index da4313af74..1a845b38f6 100644
--- a/src/sagemaker/local/image.py
+++ b/src/sagemaker/local/image.py
@@ -79,12 +79,13 @@ def __init__(self, instance_type, instance_count, image, sagemaker_session=None)
         self.container_root = None
         self.container = None
 
-    def train(self, input_data_config, hyperparameters):
+    def train(self, input_data_config, hyperparameters, job_name):
         """Run a training job locally using docker-compose.
         Args:
             input_data_config (dict): The Input Data Configuration, this contains data such as the
                 channels to be used for training.
             hyperparameters (dict): The HyperParameters for the training job.
+            job_name (str): Name of the local training job being run.
 
         Returns (str): Location of the trained model.
         """
@@ -109,7 +110,7 @@ def train(self, input_data_config, hyperparameters):
 
         training_env_vars = {
             REGION_ENV_NAME: self.sagemaker_session.boto_region_name,
-            TRAINING_JOB_NAME_ENV_NAME: json.loads(hyperparameters.get(sagemaker.model.JOB_NAME_PARAM_NAME)),
+            TRAINING_JOB_NAME_ENV_NAME: job_name,
         }
         compose_data = self._generate_compose_file('train', additional_volumes=volumes,
                                                     additional_env_vars=training_env_vars)
diff --git a/src/sagemaker/local/local_session.py b/src/sagemaker/local/local_session.py
index 2b564c8159..157eaf9b40 100644
--- a/src/sagemaker/local/local_session.py
+++ b/src/sagemaker/local/local_session.py
@@ -72,7 +72,7 @@ def create_training_job(self, TrainingJobName, AlgorithmSpecification, InputData
                                         AlgorithmSpecification['TrainingImage'], self.sagemaker_session)
         training_job = _LocalTrainingJob(container)
         hyperparameters = kwargs['HyperParameters'] if 'HyperParameters' in kwargs else {}
-        training_job.start(InputDataConfig, hyperparameters)
+        training_job.start(InputDataConfig, hyperparameters, TrainingJobName)
 
         LocalSagemakerClient._training_jobs[TrainingJobName] = training_job
 
diff --git a/tests/unit/test_image.py b/tests/unit/test_image.py
index afbd241533..d959329b5e 100644
--- a/tests/unit/test_image.py
+++ b/tests/unit/test_image.py
@@ -32,6 +32,7 @@
 REGION = 'us-west-2'
 BUCKET_NAME = 'mybucket'
 EXPANDED_ROLE = 'arn:aws:iam::111111111111:role/ExpandedRole'
+TRAINING_JOB_NAME = 'my-job'
 INPUT_DATA_CONFIG = [
     {
         'ChannelName': 'a',
@@ -55,13 +56,12 @@
 ]
 HYPERPARAMETERS = {'a': 1,
                    'b': json.dumps('bee'),
-                   'sagemaker_submit_directory': json.dumps('s3://my_bucket/code'),
-                   'sagemaker_job_name': json.dumps('my-job')}
+                   'sagemaker_submit_directory': json.dumps('s3://my_bucket/code')}
+
 
 LOCAL_CODE_HYPERPARAMETERS = {'a': 1,
                               'b': 2,
-                              'sagemaker_submit_directory': json.dumps('file:///tmp/code'),
-                              'sagemaker_job_name': json.dumps('my-job')}
+                              'sagemaker_submit_directory': json.dumps('file:///tmp/code')}
 
 
 @pytest.fixture()
@@ -230,7 +230,7 @@ def test_train(_download_folder, _cleanup, popen, _stream_output, LocalSession,
         instance_count = 2
         image = 'my-image'
         sagemaker_container = _SageMakerContainer('local', instance_count, image, sagemaker_session=sagemaker_session)
-        sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS)
+        sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS, TRAINING_JOB_NAME)
 
         channel_dir = os.path.join(directories[1], 'b')
         download_folder_calls = [call('my-own-bucket', 'prefix', channel_dir)]
@@ -252,13 +252,36 @@ def test_train(_download_folder, _cleanup, popen, _stream_output, LocalSession,
                 assert config['services'][h]['image'] == image
                 assert config['services'][h]['command'] == 'train'
                 assert 'AWS_REGION={}'.format(REGION) in config['services'][h]['environment']
-                assert 'TRAINING_JOB_NAME=my-job' in config['services'][h]['environment']
+                assert 'TRAINING_JOB_NAME={}'.format(TRAINING_JOB_NAME) in config['services'][h]['environment']
 
     # assert that expected by sagemaker container output directories exist
     assert os.path.exists(os.path.join(sagemaker_container.container_root, 'output'))
     assert os.path.exists(os.path.join(sagemaker_container.container_root, 'output/data'))
 
 
+@patch('sagemaker.local.local_session.LocalSession')
+@patch('sagemaker.local.image._stream_output')
+@patch('sagemaker.local.image._SageMakerContainer._cleanup')
+@patch('sagemaker.local.image._SageMakerContainer._download_folder')
+def test_train_with_hyperparameters_without_job_name(_download_folder, _cleanup, _stream_output, LocalSession, tmpdir):
+
+    directories = [str(tmpdir.mkdir('container-root')), str(tmpdir.mkdir('data'))]
+    with patch('sagemaker.local.image._SageMakerContainer._create_tmp_folder',
+               side_effect=directories):
+
+        instance_count = 2
+        image = 'my-image'
+        sagemaker_container = _SageMakerContainer('local', instance_count, image, sagemaker_session=LocalSession)
+        sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS, TRAINING_JOB_NAME)
+
+        docker_compose_file = os.path.join(sagemaker_container.container_root, 'docker-compose.yaml')
+
+        with open(docker_compose_file, 'r') as f:
+            config = yaml.load(f)
+            for h in sagemaker_container.hosts:
+                assert 'TRAINING_JOB_NAME={}'.format(TRAINING_JOB_NAME) in config['services'][h]['environment']
+
+
 @patch('sagemaker.local.local_session.LocalSession')
 @patch('sagemaker.local.image._stream_output', side_effect=RuntimeError('this is expected'))
 @patch('subprocess.Popen')
@@ -273,7 +296,7 @@ def test_train_error(_download_folder, _cleanup, popen, _stream_output, LocalSes
         sagemaker_container = _SageMakerContainer('local', instance_count, image, sagemaker_session=sagemaker_session)
 
         with pytest.raises(RuntimeError) as e:
-            sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS)
+            sagemaker_container.train(INPUT_DATA_CONFIG, HYPERPARAMETERS, TRAINING_JOB_NAME)
 
         assert 'this is expected' in str(e)
 
@@ -293,7 +316,7 @@ def test_train_local_code(_download_folder, _cleanup, popen, _stream_output,
         sagemaker_container = _SageMakerContainer('local', instance_count, image,
                                                   sagemaker_session=sagemaker_session)
 
-        sagemaker_container.train(INPUT_DATA_CONFIG, LOCAL_CODE_HYPERPARAMETERS)
+        sagemaker_container.train(INPUT_DATA_CONFIG, LOCAL_CODE_HYPERPARAMETERS, TRAINING_JOB_NAME)
 
         docker_compose_file = os.path.join(sagemaker_container.container_root,
                                            'docker-compose.yaml')
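Below the patch, a minimal self-contained sketch (not part of the change itself) of the behaviour it establishes: the job name handed to train() is what ends up in the container's TRAINING_JOB_NAME environment variable, rather than a value parsed back out of the 'sagemaker_job_name' hyperparameter. The helper function is hypothetical and only illustrates the data flow; only the AWS_REGION and TRAINING_JOB_NAME keys come from the patch.

def build_training_env(region, job_name):
    # Hypothetical helper, not part of the SageMaker SDK: mirrors how
    # _SageMakerContainer.train() now receives job_name as an explicit
    # argument and exports it into the docker-compose environment.
    return {
        'AWS_REGION': region,
        'TRAINING_JOB_NAME': job_name,
    }


# The job name no longer needs to be present in the hyperparameters at all.
assert build_training_env('us-west-2', 'my-job')['TRAINING_JOB_NAME'] == 'my-job'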